Module sagemaker_defect_detection.detector

None

View Source

# mypy: ignore-errors

from typing import Optional

from pathlib import Path

from os import path as osp

from collections import OrderedDict

from argparse import ArgumentParser, Namespace

from multiprocessing import cpu_count

import os

import math

import sys

import numpy as np

import torch

import torch.nn as nn

import torch.optim as optim

from torch.optim import lr_scheduler

from torch.utils.data import DataLoader

from torchvision.models.detection.image_list import ImageList

import pytorch_lightning as pl

from pytorch_lightning.core.decorators import auto_move_data

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

from sagemaker_defect_detection import Detection, NEUDET, Classification, RPN, RoI, get_augmentation, get_preprocess

from sagemaker_defect_detection.utils.coco_eval import CocoEvaluator

from sagemaker_defect_detection.utils.coco_utils import convert_to_coco_api

from sagemaker_defect_detection.utils import freeze, load_checkpoint

class DDNDetection(pl.LightningModule):

    def __init__(

        self,

        train_rpn: bool,

        train_roi: bool,

        finetune_rpn: bool,

        finetune_roi: bool,

        data_path: str,

        backbone: str,

        num_classes: int,

        learning_rate: float,

        batch_size: int,

        momentum: float,

        weight_decay: float,

        seed: int,

        pretrained_mfn_ckpt: Optional[str] = None,

        pretrained_rpn_ckpt: Optional[str] = None,

        pretrained_roi_ckpt: Optional[str] = None,

        finetuned_rpn_ckpt: Optional[str] = None,

        finetuned_roi_ckpt: Optional[str] = None,

        resume_sagemaker_from_checkpoint: Optional[str] = None,

        **kwargs,

    ) -> None:

        super().__init__()

        self.train_rpn = train_rpn

        self.train_roi = train_roi

        self.finetune_rpn = finetune_rpn

        self.finetune_roi = finetune_roi

        self.data_path = data_path

        self.backbone = backbone

        self.num_classes = num_classes

        self.learning_rate = learning_rate

        self.batch_size = batch_size

        self.momentum = momentum

        self.weight_decay = weight_decay

        self.seed = seed

        self.train_dataset = NEUDET(

            self.data_path,

            split="train",

            augmentation=get_augmentation("train"),

            preprocess=get_preprocess(),

            seed=self.seed,

        )

        self.val_dataset = NEUDET(

            self.data_path,

            split="val",

            augmentation=get_augmentation("val"),

            preprocess=get_preprocess(),

            seed=self.seed,

        )

        self.pretrained_mfn_ckpt = pretrained_mfn_ckpt

        self.pretrained_rpn_ckpt = pretrained_rpn_ckpt

        self.pretrained_roi_ckpt = pretrained_roi_ckpt

        self.finetuned_rpn_ckpt = finetuned_rpn_ckpt

        self.finetuned_roi_ckpt = finetuned_roi_ckpt

        self.resume_sagemaker_from_checkpoint = resume_sagemaker_from_checkpoint

        self.coco_evaluator = self._get_evaluator(self.val_dataset)

    def setup(self, stage) -> None:

        if self.train_rpn:  # step 2

            self.mfn = load_checkpoint(

                Classification(self.backbone, self.num_classes - 1).mfn, self.pretrained_mfn_ckpt, "model.mfn"

            )

            self.rpn = RPN()

        elif self.train_roi:  # step 3

            self.mfn = load_checkpoint(

                Classification(self.backbone, self.num_classes - 1).mfn, self.pretrained_rpn_ckpt, prefix="mfn"

            )

            freeze(self.mfn)

            self.rpn = load_checkpoint(RPN(), self.pretrained_rpn_ckpt, prefix="rpn")

            freeze(self.rpn)

            self.roi = RoI(self.num_classes)

        elif self.finetune_rpn:  # step 4 or extra finetune rpn

            if self.finetuned_rpn_ckpt and self.finetuned_roi_ckpt:  # extra finetune rpn

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.finetuned_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.finetuned_rpn_ckpt, prefix="rpn")

                self.roi = load_checkpoint(RoI(self.num_classes), self.finetuned_roi_ckpt, prefix="roi")

                freeze(self.roi)

                self.model = Detection(self.mfn, self.rpn, self.roi)

            else:

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.pretrained_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.pretrained_rpn_ckpt, prefix="rpn")

                self.roi = load_checkpoint(RoI(self.num_classes), self.pretrained_roi_ckpt, prefix="roi")

                freeze(self.roi)

                self.model = Detection(self.mfn, self.rpn, self.roi)

        elif self.finetune_roi:  # step 5 or extra finetune roi

            if self.finetuned_rpn_ckpt and self.finetuned_roi_ckpt:  # extra finetune roi

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.finetuned_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.finetuned_rpn_ckpt, prefix="rpn")

                freeze(self.rpn)

                self.roi = load_checkpoint(RoI(self.num_classes), self.finetuned_roi_ckpt, prefix="roi")

                self.model = Detection(self.mfn, self.rpn, self.roi)

            else:

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.finetuned_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.finetuned_rpn_ckpt, prefix="rpn")

                freeze(self.rpn)

                self.roi = load_checkpoint(RoI(self.num_classes), self.pretrained_roi_ckpt, prefix="roi")

                self.model = Detection(self.mfn, self.rpn, self.roi)

        else:  # step 6: final/joint model

            load_checkpoint_fn = load_checkpoint

            if self.finetuned_roi_ckpt is not None:

                ckpt_path = self.finetuned_rpn_ckpt

            elif self.resume_sagemaker_from_checkpoint is not None:

                ckpt_path = self.resume_sagemaker_from_checkpoint

            else:

                ckpt_path = None

                # ignore load_checkpoint

                load_checkpoint_fn = lambda *args: args[0]

            self.mfn = load_checkpoint_fn(Classification(self.backbone, self.num_classes - 1).mfn, ckpt_path, "mfn")

            self.rpn = load_checkpoint_fn(RPN(), ckpt_path, "rpn")

            self.roi = load_checkpoint_fn(RoI(self.num_classes), ckpt_path, "roi")

            self.model = Detection(self.mfn, self.rpn, self.roi)

        return

    @auto_move_data

    def forward(self, images, *args, **kwargs):

        if self.train_rpn:  # step 2

            images = torch.stack(images)

            features = self.mfn(images)

            features = OrderedDict({str(i): t.unsqueeze(0) for i, t in enumerate(features)})

            images = ImageList(images, [(224, 224)])

            return self.rpn(images, features, targets=kwargs.get("targets"))

        elif self.train_roi:  # step 3

            self.mfn.eval()

            self.rpn.eval()

            images = torch.stack(images)

            features = self.mfn(images)

            features = OrderedDict({str(i): t.unsqueeze(0) for i, t in enumerate(features)})

            images = ImageList(images, [(224, 224)])

            proposals, _ = self.rpn(images, features, targets=None)

            return self.roi(features, proposals, [(224, 224)], targets=kwargs.get("targets"))

        elif self.finetune_rpn:

            self.model.backbone.eval()

            self.model.roi_heads.eval()

            return self.model(images, targets=kwargs.get("targets"))

        elif self.finetune_roi:

            self.model.backbone.eval()

            self.model.rpn.eval()

            return self.model(images, targets=kwargs.get("targets"))

        else:

            return self.model(images, targets=kwargs.get("targets"))

    def _get_evaluator(self, dataset):

        coco = convert_to_coco_api(dataset)

        return CocoEvaluator(coco, ["bbox"])

    def train_dataloader(self):

        train_loader = DataLoader(

            dataset=self.train_dataset,

            batch_size=self.batch_size,

            collate_fn=self.train_dataset.collate_fn,

            shuffle=True,

            num_workers=cpu_count(),

        )

        return train_loader

    def val_dataloader(self):

        val_loader = DataLoader(

            self.val_dataset,

            batch_size=self.batch_size,

            collate_fn=self.val_dataset.collate_fn,

            shuffle=False,

            num_workers=cpu_count() // 2,

        )

        self.coco_evaluator = self._get_evaluator(val_loader.dataset)

        return val_loader

    def configure_optimizers(self):

        params = [p for p in self.parameters() if p.requires_grad]

        optimizer = optim.SGD(params, lr=self.learning_rate, momentum=self.momentum, weight_decay=self.weight_decay)

        return optimizer

    def training_step(self, batch, batch_idx):

        images, targets, _ = batch

        if self.train_rpn:

            targets = [{"boxes": t["boxes"]} for t in targets]

            _, loss_dict = self(images, targets=targets)

            loss = sum(loss for loss in loss_dict.values())

            return OrderedDict({"loss": loss, "progress_bar": loss_dict, "log": loss_dict})

        elif self.train_roi:

            _, loss_dict = self(images, targets=targets)

            loss = sum(loss for loss in loss_dict.values())

            return OrderedDict({"loss": loss, "progress_bar": loss_dict, "log": loss_dict})

        else:

            images = list(image for image in images)

            targets = [{k: v for k, v in t.items()} for t in targets]

            loss_dict = self(images, targets=targets)

            # loss keys: ['loss_classifier', 'loss_box_reg', 'loss_objectness', 'loss_rpn_box_reg']

            loss = sum(loss for loss in loss_dict.values())

            if not math.isfinite(loss.item()):

                sys.exit(1)

            return OrderedDict({"loss": loss, "progress_bar": loss_dict, "log": loss_dict})

    @auto_move_data

    def validation_step(self, batch, batch_idx):

        images, targets, _ = batch

        if self.train_rpn:  # rpn doesn't compute loss for val

            return {}

        elif self.train_roi:

            # TODO: scores are predictions scores, not a metric! iou? + acc?

            return {}

        else:

            images = list(image for image in images)

            targets = [{k: v for k, v in t.items()} for t in targets]

            outputs = self(images, targets=targets)

            ret = {target["image_id"].item(): output for target, output in zip(targets, outputs)}

            self.coco_evaluator.update(ret)

            return {}

    @auto_move_data

    def validation_epoch_end(self, outputs):

        if self.train_rpn:

            return {}

        elif self.train_roi:

            # TODO: above

            return {}

        else:

            self.coco_evaluator.synchronize_between_processes()

            self.coco_evaluator.accumulate()

            self.coco_evaluator.summarize()

            metric = self.coco_evaluator.coco_eval["bbox"].stats[0]

            metric = torch.as_tensor(metric)

            tensorboard_logs = {"main_score": metric}

            self.coco_evaluator = self._get_evaluator(self.val_dataset)  # need to update for the new evaluation

            return {"val_loss": metric, "log": tensorboard_logs, "progress_bar": tensorboard_logs}

    @staticmethod

    def add_model_specific_args(parent_parser):  # pragma: no-cover

        parser = ArgumentParser(parents=[parent_parser], add_help=False)

        aa = parser.add_argument

        aa("--train-rpn", action="store_true")

        aa("--train-roi", action="store_true")

        aa("--finetune-rpn", action="store_true")

        aa("--finetune-roi", action="store_true")

        aa("--data-path", metavar="DIR", type=str, default=os.environ["SM_CHANNEL_TRAINING"])

        aa("--backbone", default="resnet34", help="backbone model either resnet34 (default) or resnet50")

        aa("--num-classes", default=7, type=int, metavar="N", help="number of classes including the background")

        aa(

            "-b",

            "--batch-size",

            default=16,

            type=int,

            metavar="N",

            help="mini-batch size (default: 16), this is the total "

            "batch size of all GPUs on the current node when "

            "using Data Parallel or Distributed Data Parallel",

        )

        aa(

            "--lr",

            "--learning-rate",

            default=1e-3,

            type=float,

            metavar="LR",

            help="initial learning rate",

            dest="learning_rate",

        )

        aa("--momentum", default=0.9, type=float, metavar="M", help="momentum")

        aa(

            "--wd",

            "--weight-decay",

            default=1e-4,

            type=float,

            metavar="W",

            help="weight decay (default: 1e-4)",

            dest="weight_decay",

        )

        aa("--seed", type=int, default=123, help="seed for initializing training")

        aa("--pretrained-mfn-ckpt", type=str)

        aa("--pretrained-rpn-ckpt", type=str)

        aa("--pretrained-roi-ckpt", type=str)

        aa("--finetuned-rpn-ckpt", type=str)

        aa("--finetuned-roi-ckpt", type=str)

        aa("--resume-from-checkpoint", type=str)

        aa("--resume-sagemaker-from-checkpoint", type=str, default=os.getenv("SM_CHANNEL_PRETRAINED_CHECKPOINT", None))

        return parser

def get_args():

    parent_parser = ArgumentParser(add_help=False)

    aa = parent_parser.add_argument

    aa("--epochs", type=int, default=1, help="number of training epochs")

    aa("--save-path", metavar="DIR", default=os.environ["SM_MODEL_DIR"], type=str, help="path to save output")

    aa("--gpus", type=int, default=os.getenv("SM_NUM_GPUS", 1), help="how many gpus")

    aa(

        "--distributed-backend",

        type=str,

        default="",

        choices=("dp", "ddp", "ddp2"),

        help="supports three options dp, ddp, ddp2",

    )

    # aa("--use-16bit", dest="use_16bit", action="store_true", help="if true uses 16 bit precision")

    parser = DDNDetection.add_model_specific_args(parent_parser)

    return parser.parse_args()

def model_fn(model_dir):

    # TODO: `model_fn` doesn't get more args

    # see: https://github.com/aws/sagemaker-inference-toolkit/issues/65

    backbone = "resnet34"

    num_classes = 7  # including the background

    mfn = load_checkpoint(Classification(backbone, num_classes - 1).mfn, model_dir, "mfn")

    rpn = load_checkpoint(RPN(), model_dir, "rpn")

    roi = load_checkpoint(RoI(num_classes), model_dir, "roi")

    model = Detection(mfn, rpn, roi)

    model = model.eval()

    freeze(model)

    return model

def main(args: Namespace) -> None:

    ddn = DDNDetection(**vars(args))

    if args.seed is not None:

        pl.seed_everything(args.seed)  # doesn't do multi-gpu

        if torch.cuda.device_count() > 1:

            torch.cuda.manual_seed_all(args.seed)

    # TODO: add deterministic training

    # torch.backends.cudnn.deterministic = True

    if ddn.train_rpn:

        checkpoint_callback = ModelCheckpoint(

            filepath=os.path.join(args.save_path, "{epoch}-{loss:.3f}"),

            save_top_k=1,

            verbose=True,

            monitor="loss",

            mode="min",

        )

        early_stop_callback = None

    elif ddn.train_roi:

        checkpoint_callback = ModelCheckpoint(

            filepath=os.path.join(args.save_path, "{epoch}-{loss:.3f}"),

            save_top_k=1,

            verbose=True,

            monitor="loss",

            mode="min",

        )

        early_stop_callback = None

    else:

        checkpoint_callback = ModelCheckpoint(

            filepath=os.path.join(args.save_path, "{epoch}-{loss:.3f}-{main_score:.3f}"),

            save_top_k=1,

            verbose=True,

            monitor="main_score",

            mode="max",

        )

        early_stop_callback = EarlyStopping("main_score", patience=50, mode="max")

    trainer = pl.Trainer(

        default_root_dir=args.save_path,

        num_sanity_val_steps=1,

        limit_val_batches=1.0,

        gpus=args.gpus,

        max_epochs=args.epochs,

        early_stop_callback=early_stop_callback,

        checkpoint_callback=checkpoint_callback,

        distributed_backend=args.distributed_backend or None,

        # precision=16 if args.use_16bit else 32, # TODO: apex

        weights_summary="top",

        resume_from_checkpoint=None if args.resume_from_checkpoint == "" else args.resume_from_checkpoint,

    )

    trainer.fit(ddn)

    return

if __name__ == "__main__":

    main(get_args())

Functions

get_args

def get_args(

)

View Source

def get_args():

    parent_parser = ArgumentParser(add_help=False)

    aa = parent_parser.add_argument

    aa("--epochs", type=int, default=1, help="number of training epochs")

    aa("--save-path", metavar="DIR", default=os.environ["SM_MODEL_DIR"], type=str, help="path to save output")

    aa("--gpus", type=int, default=os.getenv("SM_NUM_GPUS", 1), help="how many gpus")

    aa(

        "--distributed-backend",

        type=str,

        default="",

        choices=("dp", "ddp", "ddp2"),

        help="supports three options dp, ddp, ddp2",

    )

    # aa("--use-16bit", dest="use_16bit", action="store_true", help="if true uses 16 bit precision")

    parser = DDNDetection.add_model_specific_args(parent_parser)

    return parser.parse_args()

main

def main(
    args: argparse.Namespace
) -> None

View Source

def main(args: Namespace) -> None:

    ddn = DDNDetection(**vars(args))

    if args.seed is not None:

        pl.seed_everything(args.seed)  # doesn't do multi-gpu

        if torch.cuda.device_count() > 1:

            torch.cuda.manual_seed_all(args.seed)

    # TODO: add deterministic training

    # torch.backends.cudnn.deterministic = True

    if ddn.train_rpn:

        checkpoint_callback = ModelCheckpoint(

            filepath=os.path.join(args.save_path, "{epoch}-{loss:.3f}"),

            save_top_k=1,

            verbose=True,

            monitor="loss",

            mode="min",

        )

        early_stop_callback = None

    elif ddn.train_roi:

        checkpoint_callback = ModelCheckpoint(

            filepath=os.path.join(args.save_path, "{epoch}-{loss:.3f}"),

            save_top_k=1,

            verbose=True,

            monitor="loss",

            mode="min",

        )

        early_stop_callback = None

    else:

        checkpoint_callback = ModelCheckpoint(

            filepath=os.path.join(args.save_path, "{epoch}-{loss:.3f}-{main_score:.3f}"),

            save_top_k=1,

            verbose=True,

            monitor="main_score",

            mode="max",

        )

        early_stop_callback = EarlyStopping("main_score", patience=50, mode="max")

    trainer = pl.Trainer(

        default_root_dir=args.save_path,

        num_sanity_val_steps=1,

        limit_val_batches=1.0,

        gpus=args.gpus,

        max_epochs=args.epochs,

        early_stop_callback=early_stop_callback,

        checkpoint_callback=checkpoint_callback,

        distributed_backend=args.distributed_backend or None,

        # precision=16 if args.use_16bit else 32, # TODO: apex

        weights_summary="top",

        resume_from_checkpoint=None if args.resume_from_checkpoint == "" else args.resume_from_checkpoint,

    )

    trainer.fit(ddn)

    return

model_fn

def model_fn(
    model_dir
)

View Source

def model_fn(model_dir):

    # TODO: `model_fn` doesn't get more args

    # see: https://github.com/aws/sagemaker-inference-toolkit/issues/65

    backbone = "resnet34"

    num_classes = 7  # including the background

    mfn = load_checkpoint(Classification(backbone, num_classes - 1).mfn, model_dir, "mfn")

    rpn = load_checkpoint(RPN(), model_dir, "rpn")

    roi = load_checkpoint(RoI(num_classes), model_dir, "roi")

    model = Detection(mfn, rpn, roi)

    model = model.eval()

    freeze(model)

    return model

Classes

DDNDetection

class DDNDetection(
    train_rpn: bool,
    train_roi: bool,
    finetune_rpn: bool,
    finetune_roi: bool,
    data_path: str,
    backbone: str,
    num_classes: int,
    learning_rate: float,
    batch_size: int,
    momentum: float,
    weight_decay: float,
    seed: int,
    pretrained_mfn_ckpt: Union[str, NoneType] = None,
    pretrained_rpn_ckpt: Union[str, NoneType] = None,
    pretrained_roi_ckpt: Union[str, NoneType] = None,
    finetuned_rpn_ckpt: Union[str, NoneType] = None,
    finetuned_roi_ckpt: Union[str, NoneType] = None,
    resume_sagemaker_from_checkpoint: Union[str, NoneType] = None,
    **kwargs
)

Ancestors (in MRO)

pytorch_lightning.core.lightning.LightningModule
abc.ABC
pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin
pytorch_lightning.core.grads.GradInformation
pytorch_lightning.core.saving.ModelIO
pytorch_lightning.core.hooks.ModelHooks
torch.nn.modules.module.Module

Class variables

CHECKPOINT_HYPER_PARAMS_KEY

CHECKPOINT_HYPER_PARAMS_NAME

CHECKPOINT_HYPER_PARAMS_TYPE

T_destination

dump_patches

Static methods

add_model_specific_args

def add_model_specific_args(
    parent_parser
)

View Source

    @staticmethod

    def add_model_specific_args(parent_parser):  # pragma: no-cover

        parser = ArgumentParser(parents=[parent_parser], add_help=False)

        aa = parser.add_argument

        aa("--train-rpn", action="store_true")

        aa("--train-roi", action="store_true")

        aa("--finetune-rpn", action="store_true")

        aa("--finetune-roi", action="store_true")

        aa("--data-path", metavar="DIR", type=str, default=os.environ["SM_CHANNEL_TRAINING"])

        aa("--backbone", default="resnet34", help="backbone model either resnet34 (default) or resnet50")

        aa("--num-classes", default=7, type=int, metavar="N", help="number of classes including the background")

        aa(

            "-b",

            "--batch-size",

            default=16,

            type=int,

            metavar="N",

            help="mini-batch size (default: 16), this is the total "

            "batch size of all GPUs on the current node when "

            "using Data Parallel or Distributed Data Parallel",

        )

        aa(

            "--lr",

            "--learning-rate",

            default=1e-3,

            type=float,

            metavar="LR",

            help="initial learning rate",

            dest="learning_rate",

        )

        aa("--momentum", default=0.9, type=float, metavar="M", help="momentum")

        aa(

            "--wd",

            "--weight-decay",

            default=1e-4,

            type=float,

            metavar="W",

            help="weight decay (default: 1e-4)",

            dest="weight_decay",

        )

        aa("--seed", type=int, default=123, help="seed for initializing training")

        aa("--pretrained-mfn-ckpt", type=str)

        aa("--pretrained-rpn-ckpt", type=str)

        aa("--pretrained-roi-ckpt", type=str)

        aa("--finetuned-rpn-ckpt", type=str)

        aa("--finetuned-roi-ckpt", type=str)

        aa("--resume-from-checkpoint", type=str)

        aa("--resume-sagemaker-from-checkpoint", type=str, default=os.getenv("SM_CHANNEL_PRETRAINED_CHECKPOINT", None))

        return parser

load_from_checkpoint

def load_from_checkpoint(
    checkpoint_path: str,
    *args,
    map_location: Union[Dict[str, str], str, torch.device, int, Callable, NoneType] = None,
    hparams_file: Union[str, NoneType] = None,
    tags_csv: Union[str, NoneType] = None,
    **kwargs
)

Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint

it stores the arguments passed to __init__ in the checkpoint under module_arguments

Any arguments specified through *args and **kwargs will override args stored in hparams.

Parameters:

Name	Type	Description	Default
checkpoint_path	None	Path to checkpoint. This can also be a URL.	None
args	None	Any positional args needed to init the model.	None
map_location	None	If your checkpoint saved a GPU model and you now load on CPUs
or a different number of GPUs, use this to map to the new setup.
The behaviour is the same as in :func:`torch.load`.	None
hparams_file	None	Optional path to a .yaml file with hierarchical structure
as in this example::

drop_prob: 0.2
dataloader:
    batch_size: 32

You most likely won't need this since Lightning will always save the hyperparameters to the checkpoint. However, if your checkpoint weights don't have the hyperparameters saved, use this method to pass in a .yaml file with the hparams you'd like to use. These will be converted into a :class:~dict and passed into your :class:LightningModule for use.

If your model's hparams argument is :class:~argparse.Namespace and .yaml file has hierarchical structure, you need to refactor your model to treat hparams as :class:~dict.

`tags_csv` argument is deprecated in v0.7.6. Will be removed v0.9.0.

Optional path to a .csv file with two columns (key, value) as in this example::

key,value
drop_prob,0.2
batch_size,32

View Source

    @classmethod

    def load_from_checkpoint(

            cls,

            checkpoint_path: str,

            *args,

            map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None,

            hparams_file: Optional[str] = None,

            tags_csv: Optional[str] = None,  # backward compatible, todo: remove in v0.9.0

            **kwargs

    ):

        r"""

        Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint

        it stores the arguments passed to `__init__`  in the checkpoint under `module_arguments`

        Any arguments specified through \*args and \*\*kwargs will override args stored in `hparams`.

        Args:

            checkpoint_path: Path to checkpoint. This can also be a URL.

            args: Any positional args needed to init the model.

            map_location:

                If your checkpoint saved a GPU model and you now load on CPUs

                or a different number of GPUs, use this to map to the new setup.

                The behaviour is the same as in :func:`torch.load`.

            hparams_file: Optional path to a .yaml file with hierarchical structure

                as in this example::

                    drop_prob: 0.2

                    dataloader:

                        batch_size: 32

                You most likely won't need this since Lightning will always save the hyperparameters

                to the checkpoint.

                However, if your checkpoint weights don't have the hyperparameters saved,

                use this method to pass in a .yaml file with the hparams you'd like to use.

                These will be converted into a :class:`~dict` and passed into your

                :class:`LightningModule` for use.

                If your model's `hparams` argument is :class:`~argparse.Namespace`

                and .yaml file has hierarchical structure, you need to refactor your model to treat

                `hparams` as :class:`~dict`.

                .csv files are acceptable here till v0.9.0, see tags_csv argument for detailed usage.

            tags_csv:

                .. warning:: .. deprecated:: 0.7.6

                    `tags_csv` argument is deprecated in v0.7.6. Will be removed v0.9.0.

                Optional path to a .csv file with two columns (key, value)

                as in this example::

                    key,value

                    drop_prob,0.2

                    batch_size,32

                Use this method to pass in a .csv file with the hparams you'd like to use.

            hparam_overrides: A dictionary with keys to override in the hparams

            kwargs: Any keyword args needed to init the model.

        Return:

            :class:`LightningModule` with loaded weights and hyperparameters (if available).

        Example:

            .. code-block:: python

                # load weights without mapping ...

                MyLightningModule.load_from_checkpoint('path/to/checkpoint.ckpt')

                # or load weights mapping all weights from GPU 1 to GPU 0 ...

                map_location = {'cuda:1':'cuda:0'}

                MyLightningModule.load_from_checkpoint(

                    'path/to/checkpoint.ckpt',

                    map_location=map_location

                )

                # or load weights and hyperparameters from separate files.

                MyLightningModule.load_from_checkpoint(

                    'path/to/checkpoint.ckpt',

                    hparams_file='/path/to/hparams_file.yaml'

                )

                # override some of the params with new values

                MyLightningModule.load_from_checkpoint(

                    PATH,

                    num_layers=128,

                    pretrained_ckpt_path: NEW_PATH,

                )

                # predict

                pretrained_model.eval()

                pretrained_model.freeze()

                y_hat = pretrained_model(x)

        """

        if map_location is not None:

            checkpoint = pl_load(checkpoint_path, map_location=map_location)

        else:

            checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)

        # add the hparams from csv file to checkpoint

        if tags_csv is not None:

            hparams_file = tags_csv

            rank_zero_warn('`tags_csv` argument is deprecated in v0.7.6. Will be removed v0.9.0', DeprecationWarning)

        if hparams_file is not None:

            extension = hparams_file.split('.')[-1]

            if extension.lower() in ('csv'):

                hparams = load_hparams_from_tags_csv(hparams_file)

            elif extension.lower() in ('yml', 'yaml'):

                hparams = load_hparams_from_yaml(hparams_file)

            else:

                raise ValueError('.csv, .yml or .yaml is required for `hparams_file`')

            hparams['on_gpu'] = False

            # overwrite hparams by the given file

            checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] = hparams

        # for past checkpoint need to add the new key

        if cls.CHECKPOINT_HYPER_PARAMS_KEY not in checkpoint:

            checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] = {}

        # override the hparams with values that were passed in

        checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY].update(kwargs)

        model = cls._load_model_state(checkpoint, *args, **kwargs)

        return model

load_from_metrics

def load_from_metrics(
    weights_path,
    tags_csv,
    map_location=None
)

Warning:

Deprecated in version 0.7.0. You should use :meth:load_from_checkpoint instead. Will be removed in v0.9.0.

View Source

    @classmethod

    def load_from_metrics(cls, weights_path, tags_csv, map_location=None):

        r"""

        Warning:

            Deprecated in version 0.7.0. You should use :meth:`load_from_checkpoint` instead.

            Will be removed in v0.9.0.

        """

        rank_zero_warn(

            "`load_from_metrics` method has been unified with `load_from_checkpoint` in v0.7.0."

            " The deprecated method will be removed in v0.9.0.", DeprecationWarning

        )

        return cls.load_from_checkpoint(weights_path, tags_csv=tags_csv, map_location=map_location)

Instance variables

device

dtype

example_input_array

hparams

on_gpu

True if your model is currently running on GPUs.

Useful to set flags around the LightningModule for different CPU vs GPU behavior.

Methods

add_module

def add_module(
    self,
    name: str,
    module: 'Module'
) -> None

Adds a child module to the current module.

The module can be accessed as an attribute using the given name.

Parameters:

Name	Type	Description	Default
name	string	name of the child module. The child module can be
accessed from this module using the given name	None
module	Module	child module to be added to the module.	None

View Source

    def add_module(self, name: str, module: 'Module') -> None:

        r"""Adds a child module to the current module.

        The module can be accessed as an attribute using the given name.

        Args:

            name (string): name of the child module. The child module can be

                accessed from this module using the given name

            module (Module): child module to be added to the module.

        """

        if not isinstance(module, Module) and module is not None:

            raise TypeError("{} is not a Module subclass".format(

                torch.typename(module)))

        elif not isinstance(name, torch._six.string_classes):

            raise TypeError("module name should be a string. Got {}".format(

                torch.typename(name)))

        elif hasattr(self, name) and name not in self._modules:

            raise KeyError("attribute '{}' already exists".format(name))

        elif '.' in name:

            raise KeyError("module name can't contain \".\"")

        elif name == '':

            raise KeyError("module name can't be empty string \"\"")

        self._modules[name] = module

amp_scale_loss

def amp_scale_loss(
    self,
    unscaled_loss,
    optimizer,
    optimizer_idx
)

View Source

    def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):

        if NATIVE_AMP_AVALAIBLE:

            scaled_loss = self.trainer.scaler.scale(unscaled_loss)

        else:

            scaled_loss = amp.scale_loss(unscaled_loss, optimizer)

        return scaled_loss

apply

def apply(
    self: ~T,
    fn: Callable[[ForwardRef('Module')], NoneType]
) -> ~T

Applies fn recursively to every submodule (as returned by .children())

as well as self. Typical use includes initializing the parameters of a model (see also :ref:nn-init-doc).

Parameters:

Name	Type	Description	Default
fn (	None	class:`Module` -> None): function to be applied to each submodule	None

Returns:

Type	Description
Module	self
Example::

>>> @torch.no_grad()
>>> def init_weights(m):
>>>     print(m)
>>>     if type(m) == nn.Linear:
>>>         m.weight.fill_(1.0)
>>>         print(m.weight)
>>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
>>> net.apply(init_weights)
Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[ 1.,  1.],
        [ 1.,  1.]])
Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[ 1.,  1.],
        [ 1.,  1.]])
Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)
Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
) |

View Source

    def apply(self: T, fn: Callable[['Module'], None]) -> T:

        r"""Applies ``fn`` recursively to every submodule (as returned by ``.children()``)

        as well as self. Typical use includes initializing the parameters of a model

        (see also :ref:`nn-init-doc`).

        Args:

            fn (:class:`Module` -> None): function to be applied to each submodule

        Returns:

            Module: self

        Example::

            >>> @torch.no_grad()

            >>> def init_weights(m):

            >>>     print(m)

            >>>     if type(m) == nn.Linear:

            >>>         m.weight.fill_(1.0)

            >>>         print(m.weight)

            >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))

            >>> net.apply(init_weights)

            Linear(in_features=2, out_features=2, bias=True)

            Parameter containing:

            tensor([[ 1.,  1.],

                    [ 1.,  1.]])

            Linear(in_features=2, out_features=2, bias=True)

            Parameter containing:

            tensor([[ 1.,  1.],

                    [ 1.,  1.]])

            Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            )

            Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            )

        """

        for module in self.children():

            module.apply(fn)

        fn(self)

        return self

backward

def backward(
    self,
    trainer,
    loss: torch.Tensor,
    optimizer: torch.optim.optimizer.Optimizer,
    optimizer_idx: int
) -> None

Override backward with your own implementation if you need to.

Parameters:

Name	Type	Description	Default
trainer	None	Pointer to the trainer	None
loss	None	Loss is already scaled by accumulated grads	None
optimizer	None	Current optimizer being used	None
optimizer_idx	None	Index of the current optimizer being used
Called to perform backward step.
Feel free to override as needed.

The loss passed in has already been scaled for accumulated gradients if requested.

View Source

    def backward(self, trainer, loss: Tensor, optimizer: Optimizer, optimizer_idx: int) -> None:

        """

        Override backward with your own implementation if you need to.

        Args:

            trainer: Pointer to the trainer

            loss: Loss is already scaled by accumulated grads

            optimizer: Current optimizer being used

            optimizer_idx: Index of the current optimizer being used

        Called to perform backward step.

        Feel free to override as needed.

        The loss passed in has already been scaled for accumulated gradients if requested.

        Example::

            def backward(self, trainer, loss, optimizer, optimizer_idx):

                loss.backward()

        """

        loss.backward()

bfloat16

def bfloat16(
    self: ~T
) -> ~T

Casts all floating point parameters and buffers to bfloat16 datatype.

Returns:

Type	Description
Module	self

View Source

    def bfloat16(self: T) -> T:

        r"""Casts all floating point parameters and buffers to ``bfloat16`` datatype.

        Returns:

            Module: self

        """

        return self._apply(lambda t: t.bfloat16() if t.is_floating_point() else t)

buffers

def buffers(
    self,
    recurse: bool = True
) -> Iterator[torch.Tensor]

Returns an iterator over module buffers.

Parameters:

Name	Type	Description
recurse	bool	if True, then yields buffers of this module
and all submodules. Otherwise, yields only buffers that
are direct members of this module.	None

Yields:

Type	Description
torch.Tensor	module buffer
Example::

>>> for buf in model.buffers():
>>>     print(type(buf), buf.size())
<class 'torch.Tensor'> (20L,)
<class 'torch.Tensor'> (20L, 1L, 5L, 5L) |

View Source

    def buffers(self, recurse: bool = True) -> Iterator[Tensor]:

        r"""Returns an iterator over module buffers.

        Args:

            recurse (bool): if True, then yields buffers of this module

                and all submodules. Otherwise, yields only buffers that

                are direct members of this module.

        Yields:

            torch.Tensor: module buffer

        Example::

            >>> for buf in model.buffers():

            >>>     print(type(buf), buf.size())

            <class 'torch.Tensor'> (20L,)

            <class 'torch.Tensor'> (20L, 1L, 5L, 5L)

        """

        for name, buf in self.named_buffers(recurse=recurse):

            yield buf

children

def children(
    self
) -> Iterator[ForwardRef('Module')]

Returns an iterator over immediate children modules.

Yields:

Type	Description
Module	a child module

View Source

    def children(self) -> Iterator['Module']:

        r"""Returns an iterator over immediate children modules.

        Yields:

            Module: a child module

        """

        for name, module in self.named_children():

            yield module

configure_apex

def configure_apex(
    self,
    amp: object,
    model: 'LightningModule',
    optimizers: List[torch.optim.optimizer.Optimizer],
    amp_level: str
) -> Tuple[ForwardRef('LightningModule'), List[torch.optim.optimizer.Optimizer]]

Override to init AMP your own way. Must return a model and list of optimizers.

Args: amp: pointer to amp library object. model: pointer to current :class:LightningModule. optimizers: list of optimizers passed in :meth:configure_optimizers. amp_level: AMP mode chosen ('O1', 'O2', etc...)

Return: Apex wrapped model and optimizers

Examples: .. code-block:: python

    # Default implementation used by Trainer.
    def configure_apex(self, amp, model, optimizers, amp_level):
        model, optimizers = amp.initialize(
            model, optimizers, opt_level=amp_level,
        )

        return model, optimizers

View Source

    def configure_apex(

            self,

            amp: object,

            model: 'LightningModule',

            optimizers: List[Optimizer],

            amp_level: str

    ) -> Tuple['LightningModule', List[Optimizer]]:

        r"""

        Override to init AMP your own way.

        Must return a model and list of optimizers.

        Args:

            amp: pointer to amp library object.

            model: pointer to current :class:`LightningModule`.

            optimizers: list of optimizers passed in :meth:`configure_optimizers`.

            amp_level: AMP mode chosen ('O1', 'O2', etc...)

        Return:

            Apex wrapped model and optimizers

        Examples:

            .. code-block:: python

                # Default implementation used by Trainer.

                def configure_apex(self, amp, model, optimizers, amp_level):

                    model, optimizers = amp.initialize(

                        model, optimizers, opt_level=amp_level,

                    )

                    return model, optimizers

        """

        model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)

        return model, optimizers

configure_ddp

def configure_ddp(
    self,
    model: 'LightningModule',
    device_ids: List[int]
) -> torch.nn.parallel.distributed.DistributedDataParallel

Override to init DDP in your own way or with your own wrapper. The only requirements are that:

On a validation batch the call goes to model.validation_step.
On a training batch the call goes to model.training_step.
On a testing batch, the call goes to model.test_step.+

Args: model: the :class:LightningModule currently being optimized. device_ids: the list of GPU ids.

Return: DDP wrapped model

Examples: .. code-block:: python

    # default implementation used in Trainer
    def configure_ddp(self, model, device_ids):
        # Lightning DDP simply routes to test_step, val_step, etc...
        model = LightningDistributedDataParallel(
            model,
            device_ids=device_ids,
            find_unused_parameters=True
        )
        return model

View Source

    def configure_ddp(

            self,

            model: 'LightningModule',

            device_ids: List[int]

    ) -> DistributedDataParallel:

        r"""

        Override to init DDP in your own way or with your own wrapper.

        The only requirements are that:

        1. On a validation batch the call goes to ``model.validation_step``.

        2. On a training batch the call goes to ``model.training_step``.

        3. On a testing batch, the call goes to ``model.test_step``.+

        Args:

            model: the :class:`LightningModule` currently being optimized.

            device_ids: the list of GPU ids.

        Return:

            DDP wrapped model

        Examples:

            .. code-block:: python

                # default implementation used in Trainer

                def configure_ddp(self, model, device_ids):

                    # Lightning DDP simply routes to test_step, val_step, etc...

                    model = LightningDistributedDataParallel(

                        model,

                        device_ids=device_ids,

                        find_unused_parameters=True

                    )

                    return model

        """

        model = LightningDistributedDataParallel(

            model,

            device_ids=device_ids,

            find_unused_parameters=True

        )

        return model

configure_optimizers

def configure_optimizers(
    self
)

View Source

    def configure_optimizers(self):

        params = [p for p in self.parameters() if p.requires_grad]

        optimizer = optim.SGD(params, lr=self.learning_rate, momentum=self.momentum, weight_decay=self.weight_decay)

        return optimizer

cpu

def cpu(
    self
) -> torch.nn.modules.module.Module

Moves all model parameters and buffers to the CPU.

Returns:

Type	Description
Module	self

View Source

    def cpu(self) -> Module:

        """Moves all model parameters and buffers to the CPU.

        Returns:

            Module: self

        """

        self._device = torch.device('cpu')

        return super().cpu()

cuda

def cuda(
    self,
    device: Union[int, NoneType] = None
) -> torch.nn.modules.module.Module

Moves all model parameters and buffers to the GPU.

This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized.

Parameters:

Name	Type	Description	Default
device	None	if specified, all parameters will be
copied to that device	None

Returns:

Type	Description
Module	self

View Source

    def cuda(self, device: Optional[int] = None) -> Module:

        """Moves all model parameters and buffers to the GPU.

        This also makes associated parameters and buffers different objects. So

        it should be called before constructing optimizer if the module will

        live on GPU while being optimized.

        Arguments:

            device: if specified, all parameters will be

                copied to that device

        Returns:

            Module: self

        """

        self._device = torch.device('cuda', index=device)

        return super().cuda(device=device)

double

def double(
    self
) -> torch.nn.modules.module.Module

Casts all floating point parameters and buffers to double datatype.

Returns:

Type	Description
Module	self

View Source

    def double(self) -> Module:

        """Casts all floating point parameters and buffers to ``double`` datatype.

        Returns:

            Module: self

        """

        self._dtype = torch.double

        return super().double()

eval

def eval(
    self: ~T
) -> ~T

Sets the module in evaluation mode.

This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:Dropout, :class:BatchNorm, etc.

This is equivalent with :meth:self.train(False) <torch.nn.Module.train>.

Returns:

Type	Description
Module	self

View Source

    def eval(self: T) -> T:

        r"""Sets the module in evaluation mode.

        This has any effect only on certain modules. See documentations of

        particular modules for details of their behaviors in training/evaluation

        mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,

        etc.

        This is equivalent with :meth:`self.train(False) <torch.nn.Module.train>`.

        Returns:

            Module: self

        """

        return self.train(False)

extra_repr

def extra_repr(
    self
) -> str

Set the extra representation of the module

To print customized extra information, you should reimplement this method in your own modules. Both single-line and multi-line strings are acceptable.

View Source

    def extra_repr(self) -> str:

        r"""Set the extra representation of the module

        To print customized extra information, you should reimplement

        this method in your own modules. Both single-line and multi-line

        strings are acceptable.

        """

        return ''

float

def float(
    self
) -> torch.nn.modules.module.Module

Casts all floating point parameters and buffers to float datatype.

Returns:

Type	Description
Module	self

View Source

    def float(self) -> Module:

        """Casts all floating point parameters and buffers to float datatype.

        Returns:

            Module: self

        """

        self._dtype = torch.float

        return super().float()

forward

def forward(
    self,
    images,
    *args,
    **kwargs
)

View Source

    @auto_move_data

    def forward(self, images, *args, **kwargs):

        if self.train_rpn:  # step 2

            images = torch.stack(images)

            features = self.mfn(images)

            features = OrderedDict({str(i): t.unsqueeze(0) for i, t in enumerate(features)})

            images = ImageList(images, [(224, 224)])

            return self.rpn(images, features, targets=kwargs.get("targets"))

        elif self.train_roi:  # step 3

            self.mfn.eval()

            self.rpn.eval()

            images = torch.stack(images)

            features = self.mfn(images)

            features = OrderedDict({str(i): t.unsqueeze(0) for i, t in enumerate(features)})

            images = ImageList(images, [(224, 224)])

            proposals, _ = self.rpn(images, features, targets=None)

            return self.roi(features, proposals, [(224, 224)], targets=kwargs.get("targets"))

        elif self.finetune_rpn:

            self.model.backbone.eval()

            self.model.roi_heads.eval()

            return self.model(images, targets=kwargs.get("targets"))

        elif self.finetune_roi:

            self.model.backbone.eval()

            self.model.rpn.eval()

            return self.model(images, targets=kwargs.get("targets"))

        else:

            return self.model(images, targets=kwargs.get("targets"))

freeze

def freeze(
    self
) -> None

Freeze all params for inference.

View Source

    def freeze(self) -> None:

        r"""

        Freeze all params for inference.

        Example:

            .. code-block:: python

                model = MyLightningModule(...)

                model.freeze()

        """

        for param in self.parameters():

            param.requires_grad = False

        self.eval()

get_progress_bar_dict

def get_progress_bar_dict(
    self
) -> Dict[str, Union[int, str]]

Additional items to be displayed in the progress bar.

Return: Dictionary with the items to be displayed in the progress bar.

View Source

    def get_progress_bar_dict(self) -> Dict[str, Union[int, str]]:

        r"""

        Additional items to be displayed in the progress bar.

        Return:

            Dictionary with the items to be displayed in the progress bar.

        """

        # call .item() only once but store elements without graphs

        running_train_loss = self.trainer.running_loss.mean()

        avg_training_loss = running_train_loss.cpu().item() if running_train_loss is not None else float('NaN')

        tqdm_dict = {

            'loss': '{:.3f}'.format(avg_training_loss)

        }

        if self.trainer.truncated_bptt_steps is not None:

            tqdm_dict['split_idx'] = self.trainer.split_idx

        if self.trainer.logger is not None and self.trainer.logger.version is not None:

            tqdm_dict['v_num'] = self.trainer.logger.version

        return tqdm_dict

get_tqdm_dict

def get_tqdm_dict(
    self
) -> Dict[str, Union[int, str]]

Additional items to be displayed in the progress bar.

Return: Dictionary with the items to be displayed in the progress bar.

Warning: Deprecated since v0.7.3. Use :meth:get_progress_bar_dict instead.

View Source

    def get_tqdm_dict(self) -> Dict[str, Union[int, str]]:

        """

        Additional items to be displayed in the progress bar.

        Return:

            Dictionary with the items to be displayed in the progress bar.

        Warning:

            Deprecated since v0.7.3.

            Use :meth:`get_progress_bar_dict` instead.

        """

        rank_zero_warn("`get_tqdm_dict` was renamed to `get_progress_bar_dict` in v0.7.3"

                       " and this method will be removed in v1.0.0", DeprecationWarning)

        return self.get_progress_bar_dict()

grad_norm

def grad_norm(
    self,
    norm_type: Union[float, int, str]
) -> Dict[str, float]

Compute each parameter's gradient's norm and their overall norm.

The overall norm is computed over all gradients together, as if they were concatenated into a single vector.

Parameters:

Name	Type	Description	Default
norm_type	None	The type of the used p-norm, cast to float if necessary.
Can be `'inf'` for infinity norm.

View Source

    def grad_norm(self, norm_type: Union[float, int, str]) -> Dict[str, float]:

        """Compute each parameter's gradient's norm and their overall norm.

        The overall norm is computed over all gradients together, as if they

        were concatenated into a single vector.

        Args:

            norm_type: The type of the used p-norm, cast to float if necessary.

                Can be ``'inf'`` for infinity norm.

        Return:

            norms: The dictionary of p-norms of each parameter's gradient and

                a special entry for the total p-norm of the gradients viewed

                as a single vector.

        """

        norm_type = float(norm_type)

        norms, all_norms = {}, []

        for name, p in self.named_parameters():

            if p.grad is None:

                continue

            param_norm = float(p.grad.data.norm(norm_type))

            norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 3)

            all_norms.append(param_norm)

        total_norm = float(torch.tensor(all_norms).norm(norm_type))

        norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 3)

        return norms

half

def half(
    self
) -> torch.nn.modules.module.Module

Casts all floating point parameters and buffers to half datatype.

Returns:

Type	Description
Module	self

View Source

    def half(self) -> Module:

        """Casts all floating point parameters and buffers to ``half`` datatype.

        Returns:

            Module: self

        """

        self._dtype = torch.half

        return super().half()

init_ddp_connection

def init_ddp_connection(
    self,
    global_rank: int,
    world_size: int,
    is_slurm_managing_tasks: bool = True
) -> None

Override to define your custom way of setting up a distributed environment.

Lightning's implementation uses env:// init by default and sets the first node as root for SLURM managed cluster.

Parameters:

Name	Type	Description	Default
global_rank	None	The global process idx.	None
world_size	None	Number of GPUs being use across all nodes. (num_nodes * num_gpus).	None
is_slurm_managing_tasks	None	is cluster managed by SLURM.	None

View Source

    def init_ddp_connection(

            self,

            global_rank: int,

            world_size: int,

            is_slurm_managing_tasks: bool = True

    ) -> None:

        """

        Override to define your custom way of setting up a distributed environment.

        Lightning's implementation uses env:// init by default and sets the first node as root

        for SLURM managed cluster.

        Args:

            global_rank: The global process idx.

            world_size: Number of GPUs being use across all nodes. (num_nodes * num_gpus).

            is_slurm_managing_tasks: is cluster managed by SLURM.

        """

        if is_slurm_managing_tasks:

            self._init_slurm_connection()

        if 'MASTER_ADDR' not in os.environ:

            rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost")

            os.environ['MASTER_ADDR'] = '127.0.0.1'

        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")

        if 'MASTER_PORT' not in os.environ:

            rank_zero_warn("MASTER_PORT environment variable is not defined. Set as 12910")

            os.environ['MASTER_PORT'] = '12910'

        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

        if 'WORLD_SIZE' in os.environ and int(os.environ['WORLD_SIZE']) != world_size:

            rank_zero_warn(f"WORLD_SIZE environment variable ({os.environ['WORLD_SIZE']}) "

                           f"is not equal to the computed world size ({world_size}). Ignored.")

        torch_backend = "nccl" if self.trainer.on_gpu else "gloo"

        log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}")

        torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size)

load_state_dict

def load_state_dict(
    self,
    state_dict: Dict[str, torch.Tensor],
    strict: bool = True
)

Copies parameters and buffers from :attr:state_dict into

this module and its descendants. If :attr:strict is True, then the keys of :attr:state_dict must exactly match the keys returned by this module's :meth:~torch.nn.Module.state_dict function.

Parameters:

Name	Type	Description
state_dict	dict	a dict containing parameters and
persistent buffers.	None
strict	bool	whether to strictly enforce that the keys
in :attr:`state_dict` match the keys returned by this module's
:meth:`~torch.nn.Module.state_dict` function. Default: `True`	None

Returns:

Type	Description
None	`NamedTuple` with `missing_keys` and `unexpected_keys` fields:
* missing_keys is a list of str containing the missing keys
* unexpected_keys is a list of str containing the unexpected keys

View Source

    def load_state_dict(self, state_dict: Union[Dict[str, Tensor], Dict[str, Tensor]],

                        strict: bool = True):

        r"""Copies parameters and buffers from :attr:`state_dict` into

        this module and its descendants. If :attr:`strict` is ``True``, then

        the keys of :attr:`state_dict` must exactly match the keys returned

        by this module's :meth:`~torch.nn.Module.state_dict` function.

        Arguments:

            state_dict (dict): a dict containing parameters and

                persistent buffers.

            strict (bool, optional): whether to strictly enforce that the keys

                in :attr:`state_dict` match the keys returned by this module's

                :meth:`~torch.nn.Module.state_dict` function. Default: ``True``

        Returns:

            ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:

                * **missing_keys** is a list of str containing the missing keys

                * **unexpected_keys** is a list of str containing the unexpected keys

        """

        missing_keys = []

        unexpected_keys = []

        error_msgs = []

        # copy state_dict so _load_from_state_dict can modify it

        metadata = getattr(state_dict, '_metadata', None)

        state_dict = state_dict.copy()

        if metadata is not None:

            state_dict._metadata = metadata

        def load(module, prefix=''):

            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})

            module._load_from_state_dict(

                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)

            for name, child in module._modules.items():

                if child is not None:

                    load(child, prefix + name + '.')

        load(self)

        load = None  # break load->load reference cycle

        if strict:

            if len(unexpected_keys) > 0:

                error_msgs.insert(

                    0, 'Unexpected key(s) in state_dict: {}. '.format(

                        ', '.join('"{}"'.format(k) for k in unexpected_keys)))

            if len(missing_keys) > 0:

                error_msgs.insert(

                    0, 'Missing key(s) in state_dict: {}. '.format(

                        ', '.join('"{}"'.format(k) for k in missing_keys)))

        if len(error_msgs) > 0:

            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(

                               self.__class__.__name__, "\n\t".join(error_msgs)))

        return _IncompatibleKeys(missing_keys, unexpected_keys)

modules

def modules(
    self
) -> Iterator[ForwardRef('Module')]

Returns an iterator over all modules in the network.

Yields:

Type	Description
Module	a module in the network
Note:
Duplicate modules are returned only once. In the following
example, `l` will be returned only once.

Example::

>>> l = nn.Linear(2, 2)
>>> net = nn.Sequential(l, l)
>>> for idx, m in enumerate(net.modules()):
        print(idx, '->', m)

0 -> Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)
1 -> Linear(in_features=2, out_features=2, bias=True) |

View Source

    def modules(self) -> Iterator['Module']:

        r"""Returns an iterator over all modules in the network.

        Yields:

            Module: a module in the network

        Note:

            Duplicate modules are returned only once. In the following

            example, ``l`` will be returned only once.

        Example::

            >>> l = nn.Linear(2, 2)

            >>> net = nn.Sequential(l, l)

            >>> for idx, m in enumerate(net.modules()):

                    print(idx, '->', m)

            0 -> Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            )

            1 -> Linear(in_features=2, out_features=2, bias=True)

        """

        for name, module in self.named_modules():

            yield module

named_buffers

def named_buffers(
    self,
    prefix: str = '',
    recurse: bool = True
) -> Iterator[Tuple[str, torch.Tensor]]

Returns an iterator over module buffers, yielding both the

name of the buffer as well as the buffer itself.

Parameters:

Name	Type	Description	Default
prefix	str	prefix to prepend to all buffer names.	None
recurse	bool	if True, then yields buffers of this module
and all submodules. Otherwise, yields only buffers that
are direct members of this module.	None

Yields:

Type	Description
None	(string, torch.Tensor): Tuple containing the name and buffer

Example::

>>> for name, buf in self.named_buffers():
>>>    if name in ['running_var']:
>>>        print(buf.size()) |

View Source

    def named_buffers(self, prefix: str = '', recurse: bool = True) -> Iterator[Tuple[str, Tensor]]:

        r"""Returns an iterator over module buffers, yielding both the

        name of the buffer as well as the buffer itself.

        Args:

            prefix (str): prefix to prepend to all buffer names.

            recurse (bool): if True, then yields buffers of this module

                and all submodules. Otherwise, yields only buffers that

                are direct members of this module.

        Yields:

            (string, torch.Tensor): Tuple containing the name and buffer

        Example::

            >>> for name, buf in self.named_buffers():

            >>>    if name in ['running_var']:

            >>>        print(buf.size())

        """

        gen = self._named_members(

            lambda module: module._buffers.items(),

            prefix=prefix, recurse=recurse)

        for elem in gen:

            yield elem

named_children

def named_children(
    self
) -> Iterator[Tuple[str, ForwardRef('Module')]]

Returns an iterator over immediate children modules, yielding both

the name of the module as well as the module itself.

Yields:

Type	Description
None	(string, Module): Tuple containing a name and child module

Example::

>>> for name, module in model.named_children():
>>>     if name in ['conv4', 'conv5']:
>>>         print(module) |

View Source

    def named_children(self) -> Iterator[Tuple[str, 'Module']]:

        r"""Returns an iterator over immediate children modules, yielding both

        the name of the module as well as the module itself.

        Yields:

            (string, Module): Tuple containing a name and child module

        Example::

            >>> for name, module in model.named_children():

            >>>     if name in ['conv4', 'conv5']:

            >>>         print(module)

        """

        memo = set()

        for name, module in self._modules.items():

            if module is not None and module not in memo:

                memo.add(module)

                yield name, module

named_modules

def named_modules(
    self,
    memo: Union[Set[ForwardRef('Module')], NoneType] = None,
    prefix: str = ''
)

Returns an iterator over all modules in the network, yielding

both the name of the module as well as the module itself.

Yields:

Type	Description
None	(string, Module): Tuple of name and module

Note: Duplicate modules are returned only once. In the following example, l will be returned only once.

Example::

>>> l = nn.Linear(2, 2)
>>> net = nn.Sequential(l, l)
>>> for idx, m in enumerate(net.named_modules()):
        print(idx, '->', m)

0 -> ('', Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
))
1 -> ('0', Linear(in_features=2, out_features=2, bias=True)) |

View Source

    def named_modules(self, memo: Optional[Set['Module']] = None, prefix: str = ''):

        r"""Returns an iterator over all modules in the network, yielding

        both the name of the module as well as the module itself.

        Yields:

            (string, Module): Tuple of name and module

        Note:

            Duplicate modules are returned only once. In the following

            example, ``l`` will be returned only once.

        Example::

            >>> l = nn.Linear(2, 2)

            >>> net = nn.Sequential(l, l)

            >>> for idx, m in enumerate(net.named_modules()):

                    print(idx, '->', m)

            0 -> ('', Sequential(

              (0): Linear(in_features=2, out_features=2, bias=True)

              (1): Linear(in_features=2, out_features=2, bias=True)

            ))

            1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

        """

        if memo is None:

            memo = set()

        if self not in memo:

            memo.add(self)

            yield prefix, self

            for name, module in self._modules.items():

                if module is None:

                    continue

                submodule_prefix = prefix + ('.' if prefix else '') + name

                for m in module.named_modules(memo, submodule_prefix):

                    yield m

named_parameters

def named_parameters(
    self,
    prefix: str = '',
    recurse: bool = True
) -> Iterator[Tuple[str, torch.Tensor]]

Returns an iterator over module parameters, yielding both the

name of the parameter as well as the parameter itself.

Parameters:

Name	Type	Description	Default
prefix	str	prefix to prepend to all parameter names.	None
recurse	bool	if True, then yields parameters of this module
and all submodules. Otherwise, yields only parameters that
are direct members of this module.	None

Yields:

Type	Description
None	(string, Parameter): Tuple containing the name and parameter

Example::

>>> for name, param in self.named_parameters():
>>>    if name in ['bias']:
>>>        print(param.size()) |

View Source

    def named_parameters(self, prefix: str = '', recurse: bool = True) -> Iterator[Tuple[str, Tensor]]:

        r"""Returns an iterator over module parameters, yielding both the

        name of the parameter as well as the parameter itself.

        Args:

            prefix (str): prefix to prepend to all parameter names.

            recurse (bool): if True, then yields parameters of this module

                and all submodules. Otherwise, yields only parameters that

                are direct members of this module.

        Yields:

            (string, Parameter): Tuple containing the name and parameter

        Example::

            >>> for name, param in self.named_parameters():

            >>>    if name in ['bias']:

            >>>        print(param.size())

        """

        gen = self._named_members(

            lambda module: module._parameters.items(),

            prefix=prefix, recurse=recurse)

        for elem in gen:

            yield elem

on_after_backward

def on_after_backward(
    self
) -> None

Called in the training loop after loss.backward() and before optimizers do anything.

This is the ideal place to inspect or log gradient information.

Example::

def on_after_backward(self):
    # example to inspect gradient information in tensorboard
    if self.trainer.global_step % 25 == 0:  # don't make the tf file huge
        params = self.state_dict()
        for k, v in params.items():
            grads = v
            name = k
            self.logger.experiment.add_histogram(tag=name, values=grads,
                                                 global_step=self.trainer.global_step)

View Source

    def on_after_backward(self) -> None:

        """

        Called in the training loop after loss.backward() and before optimizers do anything.

        This is the ideal place to inspect or log gradient information.

        Example::

            def on_after_backward(self):

                # example to inspect gradient information in tensorboard

                if self.trainer.global_step % 25 == 0:  # don't make the tf file huge

                    params = self.state_dict()

                    for k, v in params.items():

                        grads = v

                        name = k

                        self.logger.experiment.add_histogram(tag=name, values=grads,

                                                             global_step=self.trainer.global_step)

        """

on_batch_end

def on_batch_end(
    self
) -> None

Called in the training loop after the batch.

View Source

    def on_batch_end(self) -> None:

        """

        Called in the training loop after the batch.

        """

        # do something when the batch ends

on_batch_start

def on_batch_start(
    self,
    batch: Any
) -> None

Called in the training loop before anything happens for that batch.

If you return -1 here, you will skip training for the rest of the current epoch.

Parameters:

Name	Type	Description	Default
batch	None	The batched data as it is returned by the training DataLoader.	None

View Source

    def on_batch_start(self, batch: Any) -> None:

        """

        Called in the training loop before anything happens for that batch.

        If you return -1 here, you will skip training for the rest of the current epoch.

        Args:

            batch: The batched data as it is returned by the training DataLoader.

        """

        # do something when the batch starts

on_before_zero_grad

def on_before_zero_grad(
    self,
    optimizer: torch.optim.optimizer.Optimizer
) -> None

Called after optimizer.step() and before optimizer.zero_grad().

Called in the training loop after taking an optimizer step and before zeroing grads. Good place to inspect weight information with weights updated.

This is where it is called::

for optimizer in optimizers:
    optimizer.step()
    model.on_before_zero_grad(optimizer) # < ---- called here
    optimizer.zero_grad

Parameters:

Name	Type	Description	Default
optimizer	None	The optimizer for which grads should be zeroed.	None

View Source

    def on_before_zero_grad(self, optimizer: Optimizer) -> None:

        """

        Called after optimizer.step() and before optimizer.zero_grad().

        Called in the training loop after taking an optimizer step and before zeroing grads.

        Good place to inspect weight information with weights updated.

        This is where it is called::

            for optimizer in optimizers:

                optimizer.step()

                model.on_before_zero_grad(optimizer) # < ---- called here

                optimizer.zero_grad

        Args:

            optimizer: The optimizer for which grads should be zeroed.

        """

        # do something with the optimizer or inspect it.

on_epoch_end

def on_epoch_end(
    self
) -> None

Called in the training loop at the very end of the epoch.

View Source

    def on_epoch_end(self) -> None:

        """

        Called in the training loop at the very end of the epoch.

        """

        # do something when the epoch ends

on_epoch_start

def on_epoch_start(
    self
) -> None

Called in the training loop at the very beginning of the epoch.

View Source

    def on_epoch_start(self) -> None:

        """

        Called in the training loop at the very beginning of the epoch.

        """

        # do something when the epoch starts

on_fit_end

def on_fit_end(
    self
)

Called at the very end of fit.

If on DDP it is called on every process

View Source

    def on_fit_end(self):

        """

        Called at the very end of fit.

        If on DDP it is called on every process

        """

on_fit_start

def on_fit_start(
    self
)

Called at the very beginning of fit.

If on DDP it is called on every process

View Source

    def on_fit_start(self):

        """

        Called at the very beginning of fit.

        If on DDP it is called on every process

        """

on_hpc_load

def on_hpc_load(
    self,
    checkpoint: Dict[str, Any]
) -> None

Hook to do whatever you need right before Slurm manager loads the model.

Parameters:

Name	Type	Description	Default
checkpoint	None	A dictionary with variables from the checkpoint.	None

View Source

    def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None:

        """

        Hook to do whatever you need right before Slurm manager loads the model.

        Args:

            checkpoint: A dictionary with variables from the checkpoint.

        """

on_hpc_save

def on_hpc_save(
    self,
    checkpoint: Dict[str, Any]
) -> None

Hook to do whatever you need right before Slurm manager saves the model.

Parameters:

Name	Type	Description	Default
checkpoint	None	A dictionary in which you can save variables to save in a checkpoint.
Contents need to be pickleable.	None

View Source

    def on_hpc_save(self, checkpoint: Dict[str, Any]) -> None:

        """

        Hook to do whatever you need right before Slurm manager saves the model.

        Args:

            checkpoint: A dictionary in which you can save variables to save in a checkpoint.

                Contents need to be pickleable.

        """

on_load_checkpoint

def on_load_checkpoint(
    self,
    checkpoint: Dict[str, Any]
) -> None

Called by Lightning to restore your model.

If you saved something with :meth:on_save_checkpoint this is your chance to restore this.

Parameters:

Name	Type	Description	Default
checkpoint	None	Loaded checkpoint	None

View Source

    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:

        r"""

        Called by Lightning to restore your model.

        If you saved something with :meth:`on_save_checkpoint` this is your chance to restore this.

        Args:

            checkpoint: Loaded checkpoint

        Example:

            .. code-block:: python

                def on_load_checkpoint(self, checkpoint):

                    # 99% of the time you don't need to implement this method

                    self.something_cool_i_want_to_save = checkpoint['something_cool_i_want_to_save']

        Note:

            Lightning auto-restores global step, epoch, and train state including amp scaling.

            There is no need for you to restore anything regarding training.

        """

on_post_performance_check

def on_post_performance_check(
    self
) -> None

Called at the very end of the validation loop.

View Source

    def on_post_performance_check(self) -> None:

        """

        Called at the very end of the validation loop.

        """

        # do something before validation end

on_pre_performance_check

def on_pre_performance_check(
    self
) -> None

Called at the very beginning of the validation loop.

View Source

    def on_pre_performance_check(self) -> None:

        """

        Called at the very beginning of the validation loop.

        """

        # do something before validation starts

on_sanity_check_start

def on_sanity_check_start(
    self
)

Called before starting evaluation.

Warning: Deprecated. Will be removed in v0.9.0.

View Source

    def on_sanity_check_start(self):

        """

        Called before starting evaluation.

        Warning:

            Deprecated. Will be removed in v0.9.0.

        """

on_save_checkpoint

def on_save_checkpoint(
    self,
    checkpoint: Dict[str, Any]
) -> None

Called by Lightning when saving a checkpoint to give you a chance to store anything

else you might want to save.

Parameters:

Name	Type	Description	Default
checkpoint	None	Checkpoint to be saved	None

View Source

    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:

        r"""

        Called by Lightning when saving a checkpoint to give you a chance to store anything

        else you might want to save.

        Args:

            checkpoint: Checkpoint to be saved

        Example:

            .. code-block:: python

                def on_save_checkpoint(self, checkpoint):

                    # 99% of use cases you don't need to implement this method

                    checkpoint['something_cool_i_want_to_save'] = my_cool_pickable_object

        Note:

            Lightning saves all aspects of training (epoch, global step, etc...)

            including amp scaling.

            There is no need for you to store anything about training.

        """

on_train_end

def on_train_end(
    self
) -> None

Called at the end of training before logger experiment is closed.

View Source

    def on_train_end(self) -> None:

        """

        Called at the end of training before logger experiment is closed.

        """

        # do something at the end of training

on_train_start

def on_train_start(
    self
) -> None

Called at the beginning of training before sanity check.

View Source

    def on_train_start(self) -> None:

        """

        Called at the beginning of training before sanity check.

        """

        # do something at the start of training

optimizer_step

def optimizer_step(
    self,
    epoch: int,
    batch_idx: int,
    optimizer: torch.optim.optimizer.Optimizer,
    optimizer_idx: int,
    second_order_closure: Union[Callable, NoneType] = None,
    on_tpu: bool = False,
    using_native_amp: bool = False,
    using_lbfgs: bool = False
) -> None

Override this method to adjust the default way the

:class:~pytorch_lightning.trainer.trainer.Trainer calls each optimizer. By default, Lightning calls step() and zero_grad() as shown in the example once per optimizer.

Parameters:

Name	Type	Description	Default
epoch	None	Current epoch	None
batch_idx	None	Index of current batch	None
optimizer	None	A PyTorch optimizer	None
optimizer_idx	None	If you used multiple optimizers this indexes into that list.	None
second_order_closure	None	closure for second order methods	None
on_tpu	None	true if TPU backward is required	None
using_native_amp	None	True if using native amp	None
using_lbfgs	None	True if the matching optimizer is lbfgs	None

View Source

    def optimizer_step(

            self,

            epoch: int,

            batch_idx: int,

            optimizer: Optimizer,

            optimizer_idx: int,

            second_order_closure: Optional[Callable] = None,

            on_tpu: bool = False,

            using_native_amp: bool = False,

            using_lbfgs: bool = False,

    ) -> None:

        r"""

        Override this method to adjust the default way the

        :class:`~pytorch_lightning.trainer.trainer.Trainer` calls each optimizer.

        By default, Lightning calls ``step()`` and ``zero_grad()`` as shown in the example

        once per optimizer.

        Args:

            epoch: Current epoch

            batch_idx: Index of current batch

            optimizer: A PyTorch optimizer

            optimizer_idx: If you used multiple optimizers this indexes into that list.

            second_order_closure: closure for second order methods

            on_tpu: true if TPU backward is required

            using_native_amp: True if using native amp

            using_lbfgs: True if the matching optimizer is lbfgs

        Examples:

            .. code-block:: python

                # DEFAULT

                def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx,

                                   second_order_closure, on_tpu, using_native_amp, using_lbfgs):

                    optimizer.step()

                # Alternating schedule for optimizer steps (i.e.: GANs)

                def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx,

                                   second_order_closure, on_tpu, using_native_amp, using_lbfgs):

                    # update generator opt every 2 steps

                    if optimizer_idx == 0:

                        if batch_idx % 2 == 0 :

                            optimizer.step()

                            optimizer.zero_grad()

                    # update discriminator opt every 4 steps

                    if optimizer_idx == 1:

                        if batch_idx % 4 == 0 :

                            optimizer.step()

                            optimizer.zero_grad()

                    # ...

                    # add as many optimizers as you want

            Here's another example showing how to use this for more advanced things such as

            learning rate warm-up:

            .. code-block:: python

                # learning rate warm-up

                def optimizer_step(self, current_epoch, batch_idx, optimizer,

                                    optimizer_idx, second_order_closure, on_tpu, using_native_amp, using_lbfgs):

                    # warm up lr

                    if self.trainer.global_step < 500:

                        lr_scale = min(1., float(self.trainer.global_step + 1) / 500.)

                        for pg in optimizer.param_groups:

                            pg['lr'] = lr_scale * self.learning_rate

                    # update params

                    optimizer.step()

                    optimizer.zero_grad()

        Note:

            If you also override the :meth:`~pytorch_lightning.core.hooks.ModelHooks.on_before_zero_grad`

            model hook don't forget to add the call to it before ``optimizer.zero_grad()`` yourself.

        """

        if on_tpu:

            xm.optimizer_step(optimizer)

        elif using_native_amp:

            self.trainer.scaler.step(optimizer)

        elif using_lbfgs:

            optimizer.step(second_order_closure)

        else:

            optimizer.step()

optimizer_zero_grad

def optimizer_zero_grad(
    self,
    epoch: int,
    batch_idx: int,
    optimizer: torch.optim.optimizer.Optimizer,
    optimizer_idx: int
)

View Source

    def optimizer_zero_grad(self,

                            epoch: int,

                            batch_idx: int,

                            optimizer: Optimizer,

                            optimizer_idx: int):

        optimizer.zero_grad()

parameters

def parameters(
    self,
    recurse: bool = True
) -> Iterator[torch.nn.parameter.Parameter]

Returns an iterator over module parameters.

This is typically passed to an optimizer.

Parameters:

Name	Type	Description
recurse	bool	if True, then yields parameters of this module
and all submodules. Otherwise, yields only parameters that
are direct members of this module.	None

Yields:

Type	Description
Parameter	module parameter
Example::

>>> for param in model.parameters():
>>>     print(type(param), param.size())
<class 'torch.Tensor'> (20L,)
<class 'torch.Tensor'> (20L, 1L, 5L, 5L) |

View Source

    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:

        r"""Returns an iterator over module parameters.

        This is typically passed to an optimizer.

        Args:

            recurse (bool): if True, then yields parameters of this module

                and all submodules. Otherwise, yields only parameters that

                are direct members of this module.

        Yields:

            Parameter: module parameter

        Example::

            >>> for param in model.parameters():

            >>>     print(type(param), param.size())

            <class 'torch.Tensor'> (20L,)

            <class 'torch.Tensor'> (20L, 1L, 5L, 5L)

        """

        for name, param in self.named_parameters(recurse=recurse):

            yield param

prepare_data

def prepare_data(
    self
) -> None

Use this to download and prepare data.

.. warning:: DO NOT set state to the model (use setup instead) since this is NOT called on every GPU in DDP/TPU

Example::

def prepare_data(self):
    # good
    download_data()
    tokenize()
    etc()

    # bad
    self.split = data_split
    self.some_state = some_other_state()

In DDP prepare_data can be called in two ways (using Trainer(prepare_data_per_node)):

Once per node. This is the default and is only called on LOCAL_RANK=0.
Once in total. Only called on GLOBAL_RANK=0.

Example::

# DEFAULT
# called once per node on LOCAL_RANK=0 of that node
Trainer(prepare_data_per_node=True)

# call on GLOBAL_RANK=0 (great for shared file systems)
Trainer(prepare_data_per_node=False)

This is called before requesting the dataloaders:

.. code-block:: python

model.prepare_data()
    if ddp/tpu: init()
model.setup(stage)
model.train_dataloader()
model.val_dataloader()
model.test_dataloader()

View Source

    def prepare_data(self) -> None:

        """

        Use this to download and prepare data.

        .. warning:: DO NOT set state to the model (use `setup` instead)

            since this is NOT called on every GPU in DDP/TPU

        Example::

            def prepare_data(self):

                # good

                download_data()

                tokenize()

                etc()

                # bad

                self.split = data_split

                self.some_state = some_other_state()

        In DDP prepare_data can be called in two ways (using Trainer(prepare_data_per_node)):

        1. Once per node. This is the default and is only called on LOCAL_RANK=0.

        2. Once in total. Only called on GLOBAL_RANK=0.

        Example::

            # DEFAULT

            # called once per node on LOCAL_RANK=0 of that node

            Trainer(prepare_data_per_node=True)

            # call on GLOBAL_RANK=0 (great for shared file systems)

            Trainer(prepare_data_per_node=False)

        This is called before requesting the dataloaders:

        .. code-block:: python

            model.prepare_data()

                if ddp/tpu: init()

            model.setup(stage)

            model.train_dataloader()

            model.val_dataloader()

            model.test_dataloader()

        """

print

def print(
    self,
    *args,
    **kwargs
) -> None

Prints only from process 0. Use this in any distributed mode to log only once.

Parameters:

Name	Type	Description	Default
*args	None	The thing to print. Will be passed to Python's built-in print function.	None
**kwargs	None	Will be passed to Python's built-in print function.	None

View Source

    def print(self, *args, **kwargs) -> None:

        r"""

        Prints only from process 0. Use this in any distributed mode to log only once.

        Args:

            *args: The thing to print. Will be passed to Python's built-in print function.

            **kwargs: Will be passed to Python's built-in print function.

        Example:

            .. code-block:: python

                def forward(self, x):

                    self.print(x, 'in forward')

        """

        if self.trainer.is_global_zero:

            print(*args, **kwargs)

register_backward_hook

def register_backward_hook(
    self,
    hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, torch.Tensor]]
) -> torch.utils.hooks.RemovableHandle

Registers a backward hook on the module.

.. warning ::

The current implementation will not have the presented behavior
for complex :class:`Module` that perform many operations.
In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only
contain the gradients for a subset of the inputs and outputs.
For such :class:`Module`, you should use :func:`torch.Tensor.register_hook`
directly on a specific input or output to get the required gradients.

The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature::

hook(module, grad_input, grad_output) -> Tensor or None

The :attr:grad_input and :attr:grad_output may be tuples if the module has multiple inputs or outputs. The hook should not modify its arguments, but it can optionally return a new gradient with respect to input that will be used in place of :attr:grad_input in subsequent computations. :attr:grad_input will only correspond to the inputs given as positional arguments.

Returns:

Type	Description
None	:class:`torch.utils.hooks.RemovableHandle`:
a handle that can be used to remove the added hook by calling
`handle.remove()`

View Source

    def register_backward_hook(

        self, hook: Callable[['Module', _grad_t, _grad_t], Union[None, Tensor]]

    ) -> RemovableHandle:

        r"""Registers a backward hook on the module.

        .. warning ::

            The current implementation will not have the presented behavior

            for complex :class:`Module` that perform many operations.

            In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only

            contain the gradients for a subset of the inputs and outputs.

            For such :class:`Module`, you should use :func:`torch.Tensor.register_hook`

            directly on a specific input or output to get the required gradients.

        The hook will be called every time the gradients with respect to module

        inputs are computed. The hook should have the following signature::

            hook(module, grad_input, grad_output) -> Tensor or None

        The :attr:`grad_input` and :attr:`grad_output` may be tuples if the

        module has multiple inputs or outputs. The hook should not modify its

        arguments, but it can optionally return a new gradient with respect to

        input that will be used in place of :attr:`grad_input` in subsequent

        computations. :attr:`grad_input` will only correspond to the inputs given

        as positional arguments.

        Returns:

            :class:`torch.utils.hooks.RemovableHandle`:

                a handle that can be used to remove the added hook by calling

                ``handle.remove()``

        """

        handle = hooks.RemovableHandle(self._backward_hooks)

        self._backward_hooks[handle.id] = hook

        return handle

register_buffer

def register_buffer(
    self,
    name: str,
    tensor: torch.Tensor,
    persistent: bool = True
) -> None

Adds a buffer to the module.

This is typically used to register a buffer that should not to be considered a model parameter. For example, BatchNorm's running_mean is not a parameter, but is part of the module's state. Buffers, by default, are persistent and will be saved alongside parameters. This behavior can be changed by setting :attr:persistent to False. The only difference between a persistent buffer and a non-persistent buffer is that the latter will not be a part of this module's :attr:state_dict.

Buffers can be accessed as attributes using given names.

Args: name (string): name of the buffer. The buffer can be accessed from this module using the given name tensor (Tensor): buffer to be registered. persistent (bool): whether the buffer is part of this module's :attr:state_dict.

Example::

>>> self.register_buffer('running_mean', torch.zeros(num_features))

View Source

    def register_buffer(self, name: str, tensor: Tensor, persistent: bool = True) -> None:

        r"""Adds a buffer to the module.

        This is typically used to register a buffer that should not to be

        considered a model parameter. For example, BatchNorm's ``running_mean``

        is not a parameter, but is part of the module's state. Buffers, by

        default, are persistent and will be saved alongside parameters. This

        behavior can be changed by setting :attr:`persistent` to ``False``. The

        only difference between a persistent buffer and a non-persistent buffer

        is that the latter will not be a part of this module's

        :attr:`state_dict`.

        Buffers can be accessed as attributes using given names.

        Args:

            name (string): name of the buffer. The buffer can be accessed

                from this module using the given name

            tensor (Tensor): buffer to be registered.

            persistent (bool): whether the buffer is part of this module's

                :attr:`state_dict`.

        Example::

            >>> self.register_buffer('running_mean', torch.zeros(num_features))

        """

        if persistent is False and isinstance(self, torch.jit.ScriptModule):

            raise RuntimeError("ScriptModule does not support non-persistent buffers")

        if '_buffers' not in self.__dict__:

            raise AttributeError(

                "cannot assign buffer before Module.__init__() call")

        elif not isinstance(name, torch._six.string_classes):

            raise TypeError("buffer name should be a string. "

                            "Got {}".format(torch.typename(name)))

        elif '.' in name:

            raise KeyError("buffer name can't contain \".\"")

        elif name == '':

            raise KeyError("buffer name can't be empty string \"\"")

        elif hasattr(self, name) and name not in self._buffers:

            raise KeyError("attribute '{}' already exists".format(name))

        elif tensor is not None and not isinstance(tensor, torch.Tensor):

            raise TypeError("cannot assign '{}' object to buffer '{}' "

                            "(torch Tensor or None required)"

                            .format(torch.typename(tensor), name))

        else:

            self._buffers[name] = tensor

            if persistent:

                self._non_persistent_buffers_set.discard(name)

            else:

                self._non_persistent_buffers_set.add(name)

register_forward_hook

def register_forward_hook(
    self,
    hook: Callable[..., NoneType]
) -> torch.utils.hooks.RemovableHandle

Registers a forward hook on the module.

The hook will be called every time after :func:forward has computed an output. It should have the following signature::

hook(module, input, output) -> None or modified output

The input contains only the positional arguments given to the module. Keyword arguments won't be passed to the hooks and only to the forward. The hook can modify the output. It can modify the input inplace but it will not have effect on forward since this is called after

View Source

    def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle:

        r"""Registers a forward hook on the module.

        The hook will be called every time after :func:`forward` has computed an output.

        It should have the following signature::

            hook(module, input, output) -> None or modified output

        The input contains only the positional arguments given to the module.

        Keyword arguments won't be passed to the hooks and only to the ``forward``.

        The hook can modify the output. It can modify the input inplace but

        it will not have effect on forward since this is called after

        :func:`forward` is called.

        Returns:

            :class:`torch.utils.hooks.RemovableHandle`:

                a handle that can be used to remove the added hook by calling

                ``handle.remove()``

        """

        handle = hooks.RemovableHandle(self._forward_hooks)

        self._forward_hooks[handle.id] = hook

        return handle

register_forward_pre_hook

def register_forward_pre_hook(
    self,
    hook: Callable[..., NoneType]
) -> torch.utils.hooks.RemovableHandle

Registers a forward pre-hook on the module.

The hook will be called every time before :func:forward is invoked. It should have the following signature::

hook(module, input) -> None or modified input

The input contains only the positional arguments given to the module. Keyword arguments won't be passed to the hooks and only to the forward. The hook can modify the input. User can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if a single value is returned(unless that value is already a tuple).

Returns:

Type	Description
None	:class:`torch.utils.hooks.RemovableHandle`:
a handle that can be used to remove the added hook by calling
`handle.remove()`

View Source

    def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle:

        r"""Registers a forward pre-hook on the module.

        The hook will be called every time before :func:`forward` is invoked.

        It should have the following signature::

            hook(module, input) -> None or modified input

        The input contains only the positional arguments given to the module.

        Keyword arguments won't be passed to the hooks and only to the ``forward``.

        The hook can modify the input. User can either return a tuple or a

        single modified value in the hook. We will wrap the value into a tuple

        if a single value is returned(unless that value is already a tuple).

        Returns:

            :class:`torch.utils.hooks.RemovableHandle`:

                a handle that can be used to remove the added hook by calling

                ``handle.remove()``

        """

        handle = hooks.RemovableHandle(self._forward_pre_hooks)

        self._forward_pre_hooks[handle.id] = hook

        return handle

register_parameter

def register_parameter(
    self,
    name: str,
    param: torch.nn.parameter.Parameter
) -> None

Adds a parameter to the module.

The parameter can be accessed as an attribute using given name.

Parameters:

Name	Type	Description	Default
name	string	name of the parameter. The parameter can be accessed
from this module using the given name	None
param	Parameter	parameter to be added to the module.	None

View Source

    def register_parameter(self, name: str, param: Parameter) -> None:

        r"""Adds a parameter to the module.

        The parameter can be accessed as an attribute using given name.

        Args:

            name (string): name of the parameter. The parameter can be accessed

                from this module using the given name

            param (Parameter): parameter to be added to the module.

        """

        if '_parameters' not in self.__dict__:

            raise AttributeError(

                "cannot assign parameter before Module.__init__() call")

        elif not isinstance(name, torch._six.string_classes):

            raise TypeError("parameter name should be a string. "

                            "Got {}".format(torch.typename(name)))

        elif '.' in name:

            raise KeyError("parameter name can't contain \".\"")

        elif name == '':

            raise KeyError("parameter name can't be empty string \"\"")

        elif hasattr(self, name) and name not in self._parameters:

            raise KeyError("attribute '{}' already exists".format(name))

        if param is None:

            self._parameters[name] = None

        elif not isinstance(param, Parameter):

            raise TypeError("cannot assign '{}' object to parameter '{}' "

                            "(torch.nn.Parameter or None required)"

                            .format(torch.typename(param), name))

        elif param.grad_fn:

            raise ValueError(

                "Cannot assign non-leaf Tensor to parameter '{0}'. Model "

                "parameters must be created explicitly. To express '{0}' "

                "as a function of another Tensor, compute the value in "

                "the forward() method.".format(name))

        else:

            self._parameters[name] = param

requires_grad_

def requires_grad_(
    self: ~T,
    requires_grad: bool = True
) -> ~T

Change if autograd should record operations on parameters in this

module.

This method sets the parameters' :attr:requires_grad attributes in-place.

This method is helpful for freezing part of the module for finetuning or training parts of a model individually (e.g., GAN training).

Parameters:

Name	Type	Description	Default
requires_grad	bool	whether autograd should record operations on
parameters in this module. Default: `True`.	None

Returns:

Type	Description
Module	self

View Source

    def requires_grad_(self: T, requires_grad: bool = True) -> T:

        r"""Change if autograd should record operations on parameters in this

        module.

        This method sets the parameters' :attr:`requires_grad` attributes

        in-place.

        This method is helpful for freezing part of the module for finetuning

        or training parts of a model individually (e.g., GAN training).

        Args:

            requires_grad (bool): whether autograd should record operations on

                                  parameters in this module. Default: ``True``.

        Returns:

            Module: self

        """

        for p in self.parameters():

            p.requires_grad_(requires_grad)

        return self

save_hyperparameters

def save_hyperparameters(
    self,
    *args,
    frame=None
) -> None

Save all model arguments.

Parameters:

Name	Type	Description	Default
args	None	single object of `dict`, `NameSpace` or `OmegaConf`
or string names or argumenst from class `__init__`

from collections import OrderedDict class ManuallyArgsModel(LightningModule): ... def init(self, arg1, arg2, arg3): ... super().init() ... # manually assine arguments ... self.save_hyperparameters('arg1', 'arg3') ... def forward(self, args, *kwargs): ... ... model = ManuallyArgsModel(1, 'abc', 3.14) model.hparams "arg1": 1 "arg3": 3.14

class AutomaticArgsModel(LightningModule): ... def init(self, arg1, arg2, arg3): ... super().init() ... # equivalent automatic ... self.save_hyperparameters() ... def forward(self, args, *kwargs): ... ... model = AutomaticArgsModel(1, 'abc', 3.14) model.hparams "arg1": 1 "arg2": abc "arg3": 3.14

class SingleArgModel(LightningModule): ... def init(self, params): ... super().init() ... # manually assign single argument ... self.save_hyperparameters(params) ... def forward(self, args, *kwargs): ... ... model = SingleArgModel(Namespace(p1=1, p2='abc', p3=3.14)) model.hparams "p1": 1 "p2": abc "p3": 3.14 | None |

View Source

    def save_hyperparameters(self, *args, frame=None) -> None:

        """Save all model arguments.

        Args:

            args: single object of `dict`, `NameSpace` or `OmegaConf`

             or string names or argumenst from class `__init__`

        >>> from collections import OrderedDict

        >>> class ManuallyArgsModel(LightningModule):

        ...     def __init__(self, arg1, arg2, arg3):

        ...         super().__init__()

        ...         # manually assine arguments

        ...         self.save_hyperparameters('arg1', 'arg3')

        ...     def forward(self, *args, **kwargs):

        ...         ...

        >>> model = ManuallyArgsModel(1, 'abc', 3.14)

        >>> model.hparams

        "arg1": 1

        "arg3": 3.14

        >>> class AutomaticArgsModel(LightningModule):

        ...     def __init__(self, arg1, arg2, arg3):

        ...         super().__init__()

        ...         # equivalent automatic

        ...         self.save_hyperparameters()

        ...     def forward(self, *args, **kwargs):

        ...         ...

        >>> model = AutomaticArgsModel(1, 'abc', 3.14)

        >>> model.hparams

        "arg1": 1

        "arg2": abc

        "arg3": 3.14

        >>> class SingleArgModel(LightningModule):

        ...     def __init__(self, params):

        ...         super().__init__()

        ...         # manually assign single argument

        ...         self.save_hyperparameters(params)

        ...     def forward(self, *args, **kwargs):

        ...         ...

        >>> model = SingleArgModel(Namespace(p1=1, p2='abc', p3=3.14))

        >>> model.hparams

        "p1": 1

        "p2": abc

        "p3": 3.14

        """

        if not frame:

            frame = inspect.currentframe().f_back

        init_args = get_init_args(frame)

        assert init_args, 'failed to inspect the self init'

        if not args:

            hp = init_args

            self._hparams_name = 'kwargs' if hp else None

        else:

            isx_non_str = [i for i, arg in enumerate(args) if not isinstance(arg, str)]

            if len(isx_non_str) == 1:

                hp = args[isx_non_str[0]]

                cand_names = [k for k, v in init_args.items() if v == hp]

                self._hparams_name = cand_names[0] if cand_names else None

            else:

                hp = {arg: init_args[arg] for arg in args if isinstance(arg, str)}

                self._hparams_name = 'kwargs'

        # `hparams` are expected here

        if hp:

            self._set_hparams(hp)

setup

def setup(
    self,
    stage
) -> None

View Source

    def setup(self, stage) -> None:

        if self.train_rpn:  # step 2

            self.mfn = load_checkpoint(

                Classification(self.backbone, self.num_classes - 1).mfn, self.pretrained_mfn_ckpt, "model.mfn"

            )

            self.rpn = RPN()

        elif self.train_roi:  # step 3

            self.mfn = load_checkpoint(

                Classification(self.backbone, self.num_classes - 1).mfn, self.pretrained_rpn_ckpt, prefix="mfn"

            )

            freeze(self.mfn)

            self.rpn = load_checkpoint(RPN(), self.pretrained_rpn_ckpt, prefix="rpn")

            freeze(self.rpn)

            self.roi = RoI(self.num_classes)

        elif self.finetune_rpn:  # step 4 or extra finetune rpn

            if self.finetuned_rpn_ckpt and self.finetuned_roi_ckpt:  # extra finetune rpn

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.finetuned_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.finetuned_rpn_ckpt, prefix="rpn")

                self.roi = load_checkpoint(RoI(self.num_classes), self.finetuned_roi_ckpt, prefix="roi")

                freeze(self.roi)

                self.model = Detection(self.mfn, self.rpn, self.roi)

            else:

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.pretrained_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.pretrained_rpn_ckpt, prefix="rpn")

                self.roi = load_checkpoint(RoI(self.num_classes), self.pretrained_roi_ckpt, prefix="roi")

                freeze(self.roi)

                self.model = Detection(self.mfn, self.rpn, self.roi)

        elif self.finetune_roi:  # step 5 or extra finetune roi

            if self.finetuned_rpn_ckpt and self.finetuned_roi_ckpt:  # extra finetune roi

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.finetuned_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.finetuned_rpn_ckpt, prefix="rpn")

                freeze(self.rpn)

                self.roi = load_checkpoint(RoI(self.num_classes), self.finetuned_roi_ckpt, prefix="roi")

                self.model = Detection(self.mfn, self.rpn, self.roi)

            else:

                self.mfn = load_checkpoint(

                    Classification(self.backbone, self.num_classes - 1).mfn, self.finetuned_rpn_ckpt, prefix="mfn"

                )

                freeze(self.mfn)

                self.rpn = load_checkpoint(RPN(), self.finetuned_rpn_ckpt, prefix="rpn")

                freeze(self.rpn)

                self.roi = load_checkpoint(RoI(self.num_classes), self.pretrained_roi_ckpt, prefix="roi")

                self.model = Detection(self.mfn, self.rpn, self.roi)

        else:  # step 6: final/joint model

            load_checkpoint_fn = load_checkpoint

            if self.finetuned_roi_ckpt is not None:

                ckpt_path = self.finetuned_rpn_ckpt

            elif self.resume_sagemaker_from_checkpoint is not None:

                ckpt_path = self.resume_sagemaker_from_checkpoint

            else:

                ckpt_path = None

                # ignore load_checkpoint

                load_checkpoint_fn = lambda *args: args[0]

            self.mfn = load_checkpoint_fn(Classification(self.backbone, self.num_classes - 1).mfn, ckpt_path, "mfn")

            self.rpn = load_checkpoint_fn(RPN(), ckpt_path, "rpn")

            self.roi = load_checkpoint_fn(RoI(self.num_classes), ckpt_path, "roi")

            self.model = Detection(self.mfn, self.rpn, self.roi)

        return

share_memory

def share_memory(
    self: ~T
) -> ~T

View Source

    def share_memory(self: T) -> T:

        return self._apply(lambda t: t.share_memory_())

state_dict

def state_dict(
    self,
    destination=None,
    prefix='',
    keep_vars=False
)

Returns a dictionary containing a whole state of the module.

Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names.

Returns:

Type	Description
dict	a dictionary containing a whole state of the module

Example::

>>> module.state_dict().keys()
['bias', 'weight'] |

View Source

    def state_dict(self, destination=None, prefix='', keep_vars=False):

        r"""Returns a dictionary containing a whole state of the module.

        Both parameters and persistent buffers (e.g. running averages) are

        included. Keys are corresponding parameter and buffer names.

        Returns:

            dict:

                a dictionary containing a whole state of the module

        Example::

            >>> module.state_dict().keys()

            ['bias', 'weight']

        """

        if destination is None:

            destination = OrderedDict()

            destination._metadata = OrderedDict()

        destination._metadata[prefix[:-1]] = local_metadata = dict(version=self._version)

        self._save_to_state_dict(destination, prefix, keep_vars)

        for name, module in self._modules.items():

            if module is not None:

                module.state_dict(destination, prefix + name + '.', keep_vars=keep_vars)

        for hook in self._state_dict_hooks.values():

            hook_result = hook(self, destination, prefix, local_metadata)

            if hook_result is not None:

                destination = hook_result

        return destination

summarize

def summarize(
    self,
    mode: str = 'top'
) -> pytorch_lightning.core.memory.ModelSummary

View Source

    def summarize(self, mode: str = ModelSummary.MODE_DEFAULT) -> ModelSummary:

        model_summary = ModelSummary(self, mode=mode)

        log.info('\n' + str(model_summary))

        return model_summary

tbptt_split_batch

def tbptt_split_batch(
    self,
    batch: torch.Tensor,
    split_size: int
) -> list

When using truncated backpropagation through time, each batch must be split along the time dimension. Lightning handles this by default, but for custom behavior override this function.

Args: batch: Current batch split_size: The size of the split

Return: List of batch splits. Each split will be passed to :meth:training_step to enable truncated back propagation through time. The default implementation splits root level Tensors and Sequences at dim=1 (i.e. time dim). It assumes that each time dim is the same length.

Examples: .. code-block:: python

    def tbptt_split_batch(self, batch, split_size):
      splits = []
      for t in range(0, time_dims[0], split_size):
          batch_split = []
          for i, x in enumerate(batch):
              if isinstance(x, torch.Tensor):
                  split_x = x[:, t:t + split_size]
              elif isinstance(x, collections.Sequence):
                  split_x = [None] * len(x)
                  for batch_idx in range(len(x)):
                      split_x[batch_idx] = x[batch_idx][t:t + split_size]

              batch_split.append(split_x)

          splits.append(batch_split)

      return splits

Note: Called in the training loop after :meth:~pytorch_lightning.callbacks.base.Callback.on_batch_start if :paramref:~pytorch_lightning.trainer.Trainer.truncated_bptt_steps > 0. Each returned batch split is passed separately to :meth:training_step.

View Source

    def tbptt_split_batch(self, batch: Tensor, split_size: int) -> list:

        r"""

        When using truncated backpropagation through time, each batch must be split along the

        time dimension. Lightning handles this by default, but for custom behavior override

        this function.

        Args:

            batch: Current batch

            split_size: The size of the split

        Return:

            List of batch splits. Each split will be passed to :meth:`training_step` to enable truncated

            back propagation through time. The default implementation splits root level Tensors and

            Sequences at dim=1 (i.e. time dim). It assumes that each time dim is the same length.

        Examples:

            .. code-block:: python

                def tbptt_split_batch(self, batch, split_size):

                  splits = []

                  for t in range(0, time_dims[0], split_size):

                      batch_split = []

                      for i, x in enumerate(batch):

                          if isinstance(x, torch.Tensor):

                              split_x = x[:, t:t + split_size]

                          elif isinstance(x, collections.Sequence):

                              split_x = [None] * len(x)

                              for batch_idx in range(len(x)):

                                  split_x[batch_idx] = x[batch_idx][t:t + split_size]

                          batch_split.append(split_x)

                      splits.append(batch_split)

                  return splits

        Note:

            Called in the training loop after

            :meth:`~pytorch_lightning.callbacks.base.Callback.on_batch_start`

            if :paramref:`~pytorch_lightning.trainer.Trainer.truncated_bptt_steps` > 0.

            Each returned batch split is passed separately to :meth:`training_step`.

        """

        time_dims = [len(x[0]) for x in batch if isinstance(x, (torch.Tensor, collections.Sequence))]

        assert len(time_dims) >= 1, "Unable to determine batch time dimension"

        assert all(x == time_dims[0] for x in time_dims), "Batch time dimension length is ambiguous"

        splits = []

        for t in range(0, time_dims[0], split_size):

            batch_split = []

            for i, x in enumerate(batch):

                if isinstance(x, torch.Tensor):

                    split_x = x[:, t:t + split_size]

                elif isinstance(x, collections.Sequence):

                    split_x = [None] * len(x)

                    for batch_idx in range(len(x)):

                        split_x[batch_idx] = x[batch_idx][t:t + split_size]

                batch_split.append(split_x)

            splits.append(batch_split)

        return splits

teardown

def teardown(
    self,
    stage: str
)

Called at the end of fit and test.

Parameters:

Name	Type	Description	Default
stage	None	either 'fit' or 'test'	None

View Source

    def teardown(self, stage: str):

        """

        Called at the end of fit and test.

        Args:

            stage: either 'fit' or 'test'

        """

test_dataloader

def test_dataloader(
    self
) -> Union[torch.utils.data.dataloader.DataLoader, List[torch.utils.data.dataloader.DataLoader]]

Implement one or multiple PyTorch DataLoaders for testing.

The dataloader you return will not be called every epoch unless you set

View Source

    def test_dataloader(self) -> Union[DataLoader, List[DataLoader]]:

        r"""

        Implement one or multiple PyTorch DataLoaders for testing.

        The dataloader you return will not be called every epoch unless you set

        :paramref:`~pytorch_lightning.trainer.Trainer.reload_dataloaders_every_epoch` to ``True``.

        For data processing use the following pattern:

            - download in :meth:`prepare_data`

            - process and split in :meth:`setup`

        However, the above are only necessary for distributed processing.

        .. warning:: do not assign state in prepare_data

        - :meth:`~pytorch_lightning.trainer.Trainer.fit`

        - ...

        - :meth:`prepare_data`

        - :meth:`setup`

        - :meth:`train_dataloader`

        - :meth:`val_dataloader`

        - :meth:`test_dataloader`

        Note:

            Lightning adds the correct sampler for distributed and arbitrary hardware.

            There is no need to set it yourself.

        Return:

            Single or multiple PyTorch DataLoaders.

        Example:

            .. code-block:: python

                def test_dataloader(self):

                    transform = transforms.Compose([transforms.ToTensor(),

                                                    transforms.Normalize((0.5,), (1.0,))])

                    dataset = MNIST(root='/path/to/mnist/', train=False, transform=transform,

                                    download=True)

                    loader = torch.utils.data.DataLoader(

                        dataset=dataset,

                        batch_size=self.batch_size,

                        shuffle=False

                    )

                    return loader

        Note:

            If you don't need a test dataset and a :meth:`test_step`, you don't need to implement

            this method.

        """

test_end

def test_end(
    self,
    outputs
)

Warnings:

Deprecated in v0.7.0. Use :meth:test_epoch_end instead. Will be removed in 1.0.0.

View Source

    def test_end(self, outputs):

        """

        Warnings:

             Deprecated in v0.7.0. Use :meth:`test_epoch_end` instead.

             Will be removed in 1.0.0.

        """

test_epoch_end

def test_epoch_end(
    self,
    outputs: Union[List[Dict[str, torch.Tensor]], List[List[Dict[str, torch.Tensor]]]]
) -> Dict[str, Dict[str, torch.Tensor]]

Called at the end of a test epoch with the output of all test steps.

.. code-block:: python

# the pseudocode for these calls
test_outs = []
for test_batch in test_data:
    out = test_step(test_batch)
    test_outs.append(out)
test_epoch_end(test_outs)

Args: outputs: List of outputs you defined in :meth:test_step_end, or if there are multiple dataloaders, a list containing a list of outputs for each dataloader

Return: Dict or OrderedDict: Dict has the following optional keys:

- progress_bar -> Dict for progress bar display. Must have only tensors.
- log -> Dict of metrics to add to logger. Must have only tensors (no images, etc).

Note: If you didn't define a :meth:test_step, this won't be called.

The outputs here are strictly for logging or progress bar.
If you don't need to display anything, don't return anything.
If you want to manually set current step, specify it with the 'step' key in the 'log' Dict

Examples: With a single dataloader:

.. code-block:: python

    def test_epoch_end(self, outputs):
        test_acc_mean = 0
        for output in outputs:
            test_acc_mean += output['test_acc']

        test_acc_mean /= len(outputs)
        tqdm_dict = {'test_acc': test_acc_mean.item()}

        # show test_loss and test_acc in progress bar but only log test_loss
        results = {
            'progress_bar': tqdm_dict,
            'log': {'test_acc': test_acc_mean.item()}
        }
        return results

With multiple dataloaders, `outputs` will be a list of lists. The outer list contains
one entry per dataloader, while the inner list contains the individual outputs of
each test step for that dataloader.

.. code-block:: python

    def test_epoch_end(self, outputs):
        test_acc_mean = 0
        i = 0
        for dataloader_outputs in outputs:
            for output in dataloader_outputs:
                test_acc_mean += output['test_acc']
                i += 1

        test_acc_mean /= i
        tqdm_dict = {'test_acc': test_acc_mean.item()}

        # show test_loss and test_acc in progress bar but only log test_loss
        results = {
            'progress_bar': tqdm_dict,
            'log': {'test_acc': test_acc_mean.item(), 'step': self.current_epoch}
        }
        return results

View Source

    def test_epoch_end(

            self,

            outputs: Union[List[Dict[str, Tensor]], List[List[Dict[str, Tensor]]]]

    ) -> Dict[str, Dict[str, Tensor]]:

        """

        Called at the end of a test epoch with the output of all test steps.

        .. code-block:: python

            # the pseudocode for these calls

            test_outs = []

            for test_batch in test_data:

                out = test_step(test_batch)

                test_outs.append(out)

            test_epoch_end(test_outs)

        Args:

            outputs: List of outputs you defined in :meth:`test_step_end`, or if there

                are multiple dataloaders, a list containing a list of outputs for each dataloader

        Return:

            Dict or OrderedDict: Dict has the following optional keys:

            - progress_bar -> Dict for progress bar display. Must have only tensors.

            - log -> Dict of metrics to add to logger. Must have only tensors (no images, etc).

        Note:

            If you didn't define a :meth:`test_step`, this won't be called.

        - The outputs here are strictly for logging or progress bar.

        - If you don't need to display anything, don't return anything.

        - If you want to manually set current step, specify it with the 'step' key in the 'log' Dict

        Examples:

            With a single dataloader:

            .. code-block:: python

                def test_epoch_end(self, outputs):

                    test_acc_mean = 0

                    for output in outputs:

                        test_acc_mean += output['test_acc']

                    test_acc_mean /= len(outputs)

                    tqdm_dict = {'test_acc': test_acc_mean.item()}

                    # show test_loss and test_acc in progress bar but only log test_loss

                    results = {

                        'progress_bar': tqdm_dict,

                        'log': {'test_acc': test_acc_mean.item()}

                    }

                    return results

            With multiple dataloaders, `outputs` will be a list of lists. The outer list contains

            one entry per dataloader, while the inner list contains the individual outputs of

            each test step for that dataloader.

            .. code-block:: python

                def test_epoch_end(self, outputs):

                    test_acc_mean = 0

                    i = 0

                    for dataloader_outputs in outputs:

                        for output in dataloader_outputs:

                            test_acc_mean += output['test_acc']

                            i += 1

                    test_acc_mean /= i

                    tqdm_dict = {'test_acc': test_acc_mean.item()}

                    # show test_loss and test_acc in progress bar but only log test_loss

                    results = {

                        'progress_bar': tqdm_dict,

                        'log': {'test_acc': test_acc_mean.item(), 'step': self.current_epoch}

                    }

                    return results

        """

test_step

def test_step(
    self,
    *args,
    **kwargs
) -> Dict[str, torch.Tensor]

Operates on a single batch of data from the test set. In this step you'd normally generate examples or calculate anything of interest such as accuracy.

.. code-block:: python

# the pseudocode for these calls
test_outs = []
for test_batch in test_data:
    out = test_step(test_batch)
    test_outs.append(out)
test_epoch_end(test_outs)

Args: batch (:class:~torch.Tensor | (:class:~torch.Tensor, ...) | [:class:~torch.Tensor, ...]): The output of your :class:~torch.utils.data.DataLoader. A tensor, tuple or list. batch_idx (int): The index of this batch. dataloader_idx (int): The index of the dataloader that produced this batch (only if multiple test datasets used).

Return: Dict or OrderedDict - passed to the :meth:test_epoch_end method. If you defined :meth:test_step_end it will go to that first.

.. code-block:: python

# if you have one test dataloader:
def test_step(self, batch, batch_idx)

# if you have multiple test dataloaders:
def test_step(self, batch, batch_idx, dataloader_idx)

Examples: .. code-block:: python

    # CASE 1: A single test dataset
    def test_step(self, batch, batch_idx):
        x, y = batch

        # implement your own
        out = self(x)
        loss = self.loss(out, y)

        # log 6 example images
        # or generated text... or whatever
        sample_imgs = x[:6]
        grid = torchvision.utils.make_grid(sample_imgs)
        self.logger.experiment.add_image('example_images', grid, 0)

        # calculate acc
        labels_hat = torch.argmax(out, dim=1)
        val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)

        # all optional...
        # return whatever you need for the collation function test_epoch_end
        output = OrderedDict({
            'val_loss': loss_val,
            'val_acc': torch.tensor(val_acc), # everything must be a tensor
        })

        # return an optional dict
        return output

If you pass in multiple validation datasets, :meth:`test_step` will have an additional
argument.

.. code-block:: python

    # CASE 2: multiple test datasets
    def test_step(self, batch, batch_idx, dataset_idx):
        # dataset_idx tells you which dataset this is.

Note: If you don't need to validate you don't need to implement this method.

Note: When the :meth:test_step is called, the model has been put in eval mode and PyTorch gradients have been disabled. At the end of the test epoch, the model goes back to training mode and gradients are enabled.

View Source

    def test_step(self, *args, **kwargs) -> Dict[str, Tensor]:

        r"""

        Operates on a single batch of data from the test set.

        In this step you'd normally generate examples or calculate anything of interest

        such as accuracy.

        .. code-block:: python

            # the pseudocode for these calls

            test_outs = []

            for test_batch in test_data:

                out = test_step(test_batch)

                test_outs.append(out)

            test_epoch_end(test_outs)

        Args:

            batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):

                The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.

            batch_idx (int): The index of this batch.

            dataloader_idx (int): The index of the dataloader that produced this batch

                (only if multiple test datasets used).

        Return:

            Dict or OrderedDict - passed to the :meth:`test_epoch_end` method.

            If you defined :meth:`test_step_end` it will go to that first.

        .. code-block:: python

            # if you have one test dataloader:

            def test_step(self, batch, batch_idx)

            # if you have multiple test dataloaders:

            def test_step(self, batch, batch_idx, dataloader_idx)

        Examples:

            .. code-block:: python

                # CASE 1: A single test dataset

                def test_step(self, batch, batch_idx):

                    x, y = batch

                    # implement your own

                    out = self(x)

                    loss = self.loss(out, y)

                    # log 6 example images

                    # or generated text... or whatever

                    sample_imgs = x[:6]

                    grid = torchvision.utils.make_grid(sample_imgs)

                    self.logger.experiment.add_image('example_images', grid, 0)

                    # calculate acc

                    labels_hat = torch.argmax(out, dim=1)

                    val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)

                    # all optional...

                    # return whatever you need for the collation function test_epoch_end

                    output = OrderedDict({

                        'val_loss': loss_val,

                        'val_acc': torch.tensor(val_acc), # everything must be a tensor

                    })

                    # return an optional dict

                    return output

            If you pass in multiple validation datasets, :meth:`test_step` will have an additional

            argument.

            .. code-block:: python

                # CASE 2: multiple test datasets

                def test_step(self, batch, batch_idx, dataset_idx):

                    # dataset_idx tells you which dataset this is.

        Note:

            If you don't need to validate you don't need to implement this method.

        Note:

            When the :meth:`test_step` is called, the model has been put in eval mode and

            PyTorch gradients have been disabled. At the end of the test epoch, the model goes back

            to training mode and gradients are enabled.

        """

test_step_end

def test_step_end(
    self,
    *args,
    **kwargs
) -> Dict[str, torch.Tensor]

Use this when testing with dp or ddp2 because :meth:test_step will operate

on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss.

Note: If you later switch to ddp or some other mode, this will still be called so that you don't have to change your code.

.. code-block:: python

# pseudocode
sub_batches = split_batches_for_dp(batch)
batch_parts_outputs = [test_step(sub_batch) for sub_batch in sub_batches]
test_step_end(batch_parts_outputs)

Parameters:

Name	Type	Description
batch_parts_outputs	None	What you return in :meth:`test_step` for each batch part.
Return:
Dict or OrderedDict - passed to the :meth:`test_epoch_end`.	None

View Source

    def test_step_end(self, *args, **kwargs) -> Dict[str, Tensor]:

        """

        Use this when testing with dp or ddp2 because :meth:`test_step` will operate

        on only part of the batch. However, this is still optional

        and only needed for things like softmax or NCE loss.

        Note:

            If you later switch to ddp or some other mode, this will still be called

            so that you don't have to change your code.

        .. code-block:: python

            # pseudocode

            sub_batches = split_batches_for_dp(batch)

            batch_parts_outputs = [test_step(sub_batch) for sub_batch in sub_batches]

            test_step_end(batch_parts_outputs)

        Args:

            batch_parts_outputs: What you return in :meth:`test_step` for each batch part.

        Return:

             Dict or OrderedDict - passed to the :meth:`test_epoch_end`.

        Examples:

            .. code-block:: python

                # WITHOUT test_step_end

                # if used in DP or DDP2, this batch is 1/num_gpus large

                def test_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

                # --------------

                # with test_step_end to do softmax over the full batch

                def test_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    return {'out': out}

                def test_step_end(self, outputs):

                    # this out is now the full size of the batch

                    out = outputs['out']

                    # this softmax now uses the full batch size

                    loss = nce_loss(loss)

                    return {'loss': loss}

        See Also:

            See the :ref:`multi-gpu-training` guide for more details.

        """

tng_dataloader

def tng_dataloader(
    self
)

Warnings:

Deprecated in v0.5.0. Use :meth:train_dataloader instead. Will be removed in 1.0.0.

View Source

    def tng_dataloader(self):  # todo: remove in v1.0.0

        """

        Warnings:

            Deprecated in v0.5.0. Use :meth:`train_dataloader` instead. Will be removed in 1.0.0.

        """

        output = self.train_dataloader()

        rank_zero_warn("`tng_dataloader` has been renamed to `train_dataloader` since v0.5.0."

                       " and this method will be removed in v1.0.0", DeprecationWarning)

        return output

to

def to(
    self,
    *args,
    **kwargs
) -> torch.nn.modules.module.Module

Moves and/or casts the parameters and buffers.

This can be called as .. function:: to(device=None, dtype=None, non_blocking=False) .. function:: to(dtype, non_blocking=False) .. function:: to(tensor, non_blocking=False) Its signature is similar to :meth:torch.Tensor.to, but only accepts floating point desired :attr:dtype s. In addition, this method will only cast the floating point parameters and buffers to :attr:dtype (if given). The integral parameters and buffers will be moved :attr:device, if that is given, but with dtypes unchanged. When :attr:non_blocking is set, it tries to convert/move asynchronously with respect to the host if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. See below for examples.

Note: This method modifies the module in-place.

Parameters:

Name	Type	Description
device	None	the desired device of the parameters
and buffers in this module	None
dtype	None	the desired floating point type of
the floating point parameters and buffers in this module	None
tensor	None	Tensor whose dtype and device are the desired
dtype and device for all parameters and buffers in this module	None

Returns:

Type	Description
Module	self
Example::
>>> class ExampleModule(DeviceDtypeModuleMixin):
... def init(self, weight: torch.Tensor):
... super().init()
... self.register_buffer('weight', weight)
>>> _ = torch.manual_seed(0)
>>> module = ExampleModule(torch.rand(3, 4))
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]])
>>> module.to(torch.double)
ExampleModule()
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]], dtype=torch.float64)
>>> cpu = torch.device('cpu')
>>> module.to(cpu, dtype=torch.half, non_blocking=True)
ExampleModule()
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]], dtype=torch.float16)
>>> module.to(cpu)
ExampleModule()
>>> module.weight #doctest: +ELLIPSIS
tensor([[...]], dtype=torch.float16)

View Source

    def to(self, *args, **kwargs) -> Module:

        """Moves and/or casts the parameters and buffers.

        This can be called as

        .. function:: to(device=None, dtype=None, non_blocking=False)

        .. function:: to(dtype, non_blocking=False)

        .. function:: to(tensor, non_blocking=False)

        Its signature is similar to :meth:`torch.Tensor.to`, but only accepts

        floating point desired :attr:`dtype` s. In addition, this method will

        only cast the floating point parameters and buffers to :attr:`dtype`

        (if given). The integral parameters and buffers will be moved

        :attr:`device`, if that is given, but with dtypes unchanged. When

        :attr:`non_blocking` is set, it tries to convert/move asynchronously

        with respect to the host if possible, e.g., moving CPU Tensors with

        pinned memory to CUDA devices.

        See below for examples.

        Note:

            This method modifies the module in-place.

        Args:

            device: the desired device of the parameters

                and buffers in this module

            dtype: the desired floating point type of

                the floating point parameters and buffers in this module

            tensor: Tensor whose dtype and device are the desired

                dtype and device for all parameters and buffers in this module

        Returns:

            Module: self

        Example::

            >>> class ExampleModule(DeviceDtypeModuleMixin):

            ...     def __init__(self, weight: torch.Tensor):

            ...         super().__init__()

            ...         self.register_buffer('weight', weight)

            >>> _ = torch.manual_seed(0)

            >>> module = ExampleModule(torch.rand(3, 4))

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]])

            >>> module.to(torch.double)

            ExampleModule()

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]], dtype=torch.float64)

            >>> cpu = torch.device('cpu')

            >>> module.to(cpu, dtype=torch.half, non_blocking=True)

            ExampleModule()

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]], dtype=torch.float16)

            >>> module.to(cpu)

            ExampleModule()

            >>> module.weight #doctest: +ELLIPSIS

            tensor([[...]], dtype=torch.float16)

        """

        # there is diff nb vars in PT 1.5

        out = torch._C._nn._parse_to(*args, **kwargs)

        device = out[0]

        dtype = out[1]

        if device is not None:

            self._device = device

        if dtype is not None:

            self._dtype = dtype

        return super().to(*args, **kwargs)

train

def train(
    self: ~T,
    mode: bool = True
) -> ~T

Sets the module in training mode.

This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:Dropout, :class:BatchNorm, etc.

Parameters:

Name	Type	Description	Default
mode	bool	whether to set training mode (`True`) or evaluation
mode (`False`). Default: `True`.	None

Returns:

Type	Description
Module	self

View Source

    def train(self: T, mode: bool = True) -> T:

        r"""Sets the module in training mode.

        This has any effect only on certain modules. See documentations of

        particular modules for details of their behaviors in training/evaluation

        mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,

        etc.

        Args:

            mode (bool): whether to set training mode (``True``) or evaluation

                         mode (``False``). Default: ``True``.

        Returns:

            Module: self

        """

        self.training = mode

        for module in self.children():

            module.train(mode)

        return self

train_dataloader

def train_dataloader(
    self
)

View Source

    def train_dataloader(self):

        train_loader = DataLoader(

            dataset=self.train_dataset,

            batch_size=self.batch_size,

            collate_fn=self.train_dataset.collate_fn,

            shuffle=True,

            num_workers=cpu_count(),

        )

        return train_loader

training_end

def training_end(
    self,
    *args,
    **kwargs
)

Warnings:

Deprecated in v0.7.0. Use :meth:training_step_end instead.

View Source

    def training_end(self, *args, **kwargs):

        """

        Warnings:

            Deprecated in v0.7.0. Use  :meth:`training_step_end` instead.

        """

training_epoch_end

def training_epoch_end(
    self,
    outputs: Union[List[Dict[str, torch.Tensor]], List[List[Dict[str, torch.Tensor]]]]
) -> Dict[str, Dict[str, torch.Tensor]]

Called at the end of the training epoch with the outputs of all training steps.

.. code-block:: python

# the pseudocode for these calls
train_outs = []
for train_batch in train_data:
    out = training_step(train_batch)
    train_outs.append(out)
training_epoch_end(train_outs)

Args: outputs: List of outputs you defined in :meth:training_step, or if there are multiple dataloaders, a list containing a list of outputs for each dataloader.

Return: Dict or OrderedDict. May contain the following optional keys:

- log (metrics to be added to the logger; only tensors)
- progress_bar (dict for progress bar display)
- any metric used in a callback (e.g. early stopping).

Note: If this method is not overridden, this won't be called.

The outputs here are strictly for logging or progress bar.
If you don't need to display anything, don't return anything.
If you want to manually set current step, you can specify the 'step' key in the 'log' dict.

Examples: With a single dataloader:

.. code-block:: python

    def training_epoch_end(self, outputs):
        train_acc_mean = 0
        for output in outputs:
            train_acc_mean += output['train_acc']

        train_acc_mean /= len(outputs)

        # log training accuracy at the end of an epoch
        results = {
            'log': {'train_acc': train_acc_mean.item()},
            'progress_bar': {'train_acc': train_acc_mean},
        }
        return results

With multiple dataloaders, ``outputs`` will be a list of lists. The outer list contains
one entry per dataloader, while the inner list contains the individual outputs of
each training step for that dataloader.

.. code-block:: python

    def training_epoch_end(self, outputs):
        train_acc_mean = 0
        i = 0
        for dataloader_outputs in outputs:
            for output in dataloader_outputs:
                train_acc_mean += output['train_acc']
                i += 1

        train_acc_mean /= i

        # log training accuracy at the end of an epoch
        results = {
            'log': {'train_acc': train_acc_mean.item(), 'step': self.current_epoch}
            'progress_bar': {'train_acc': train_acc_mean},
        }
        return results

View Source

    def training_epoch_end(

            self,

            outputs: Union[List[Dict[str, Tensor]], List[List[Dict[str, Tensor]]]]

    ) -> Dict[str, Dict[str, Tensor]]:

        """Called at the end of the training epoch with the outputs of all training steps.

        .. code-block:: python

            # the pseudocode for these calls

            train_outs = []

            for train_batch in train_data:

                out = training_step(train_batch)

                train_outs.append(out)

            training_epoch_end(train_outs)

        Args:

            outputs: List of outputs you defined in :meth:`training_step`, or if there are

                multiple dataloaders, a list containing a list of outputs for each dataloader.

        Return:

            Dict or OrderedDict.

            May contain the following optional keys:

            - log (metrics to be added to the logger; only tensors)

            - progress_bar (dict for progress bar display)

            - any metric used in a callback (e.g. early stopping).

        Note:

            If this method is not overridden, this won't be called.

        - The outputs here are strictly for logging or progress bar.

        - If you don't need to display anything, don't return anything.

        - If you want to manually set current step, you can specify the 'step' key in the 'log' dict.

        Examples:

            With a single dataloader:

            .. code-block:: python

                def training_epoch_end(self, outputs):

                    train_acc_mean = 0

                    for output in outputs:

                        train_acc_mean += output['train_acc']

                    train_acc_mean /= len(outputs)

                    # log training accuracy at the end of an epoch

                    results = {

                        'log': {'train_acc': train_acc_mean.item()},

                        'progress_bar': {'train_acc': train_acc_mean},

                    }

                    return results

            With multiple dataloaders, ``outputs`` will be a list of lists. The outer list contains

            one entry per dataloader, while the inner list contains the individual outputs of

            each training step for that dataloader.

            .. code-block:: python

                def training_epoch_end(self, outputs):

                    train_acc_mean = 0

                    i = 0

                    for dataloader_outputs in outputs:

                        for output in dataloader_outputs:

                            train_acc_mean += output['train_acc']

                            i += 1

                    train_acc_mean /= i

                    # log training accuracy at the end of an epoch

                    results = {

                        'log': {'train_acc': train_acc_mean.item(), 'step': self.current_epoch}

                        'progress_bar': {'train_acc': train_acc_mean},

                    }

                    return results

        """

training_step

def training_step(
    self,
    batch,
    batch_idx
)

View Source

    def training_step(self, batch, batch_idx):

        images, targets, _ = batch

        if self.train_rpn:

            targets = [{"boxes": t["boxes"]} for t in targets]

            _, loss_dict = self(images, targets=targets)

            loss = sum(loss for loss in loss_dict.values())

            return OrderedDict({"loss": loss, "progress_bar": loss_dict, "log": loss_dict})

        elif self.train_roi:

            _, loss_dict = self(images, targets=targets)

            loss = sum(loss for loss in loss_dict.values())

            return OrderedDict({"loss": loss, "progress_bar": loss_dict, "log": loss_dict})

        else:

            images = list(image for image in images)

            targets = [{k: v for k, v in t.items()} for t in targets]

            loss_dict = self(images, targets=targets)

            # loss keys: ['loss_classifier', 'loss_box_reg', 'loss_objectness', 'loss_rpn_box_reg']

            loss = sum(loss for loss in loss_dict.values())

            if not math.isfinite(loss.item()):

                sys.exit(1)

            return OrderedDict({"loss": loss, "progress_bar": loss_dict, "log": loss_dict})

training_step_end

def training_step_end(
    self,
    *args,
    **kwargs
) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]

Use this when training with dp or ddp2 because :meth:training_step will operate on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss.

Note: If you later switch to ddp or some other mode, this will still be called so that you don't have to change your code

.. code-block:: python

# pseudocode
sub_batches = split_batches_for_dp(batch)
batch_parts_outputs = [training_step(sub_batch) for sub_batch in sub_batches]
training_step_end(batch_parts_outputs)

Args: batch_parts_outputs: What you return in training_step for each batch part.

Return: Dict with loss key and optional log or progress bar keys.

- loss -> tensor scalar **REQUIRED**
- progress_bar -> Dict for progress bar display. Must have only tensors
- log -> Dict of metrics to add to logger. Must have only tensors (no images, etc)

Examples: .. code-block:: python

    # WITHOUT training_step_end
    # if used in DP or DDP2, this batch is 1/num_gpus large
    def training_step(self, batch, batch_idx):
        # batch is 1/num_gpus big
        x, y = batch

        out = self(x)
        loss = self.softmax(out)
        loss = nce_loss(loss)
        return {'loss': loss}

    # --------------
    # with training_step_end to do softmax over the full batch
    def training_step(self, batch, batch_idx):
        # batch is 1/num_gpus big
        x, y = batch

        out = self(x)
        return {'out': out}

    def training_step_end(self, outputs):
        # this out is now the full size of the batch
        out = outputs['out']

        # this softmax now uses the full batch size
        loss = nce_loss(loss)
        return {'loss': loss}

See Also: See the :ref:multi-gpu-training guide for more details.

View Source

    def training_step_end(self, *args, **kwargs) -> Dict[

        str, Union[Tensor, Dict[str, Tensor]]

    ]:

        """

        Use this when training with dp or ddp2 because :meth:`training_step`

        will operate on only part of the batch. However, this is still optional

        and only needed for things like softmax or NCE loss.

        Note:

            If you later switch to ddp or some other mode, this will still be called

            so that you don't have to change your code

        .. code-block:: python

            # pseudocode

            sub_batches = split_batches_for_dp(batch)

            batch_parts_outputs = [training_step(sub_batch) for sub_batch in sub_batches]

            training_step_end(batch_parts_outputs)

        Args:

            batch_parts_outputs: What you return in `training_step` for each batch part.

        Return:

            Dict with loss key and optional log or progress bar keys.

            - loss -> tensor scalar **REQUIRED**

            - progress_bar -> Dict for progress bar display. Must have only tensors

            - log -> Dict of metrics to add to logger. Must have only tensors (no images, etc)

        Examples:

            .. code-block:: python

                # WITHOUT training_step_end

                # if used in DP or DDP2, this batch is 1/num_gpus large

                def training_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

                # --------------

                # with training_step_end to do softmax over the full batch

                def training_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    return {'out': out}

                def training_step_end(self, outputs):

                    # this out is now the full size of the batch

                    out = outputs['out']

                    # this softmax now uses the full batch size

                    loss = nce_loss(loss)

                    return {'loss': loss}

        See Also:

            See the :ref:`multi-gpu-training` guide for more details.

        """

transfer_batch_to_device

def transfer_batch_to_device(
    self,
    batch: Any,
    device: torch.device
) -> Any

Override this hook if your :class:~torch.utils.data.DataLoader returns tensors

wrapped in a custom data structure.

The data types listed below (and any arbitrary nesting of them) are supported out of the box:

:class:torch.Tensor or anything that implements .to(...)
:class:list
:class:dict
:class:tuple
:class:torchtext.data.batch.Batch

For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

Example::

def transfer_batch_to_device(self, batch, device)
    if isinstance(batch, CustomBatch):
        # move all tensors in your custom data structure to the device
        batch.samples = batch.samples.to(device)
        batch.targets = batch.targets.to(device)
    else:
        batch = super().transfer_batch_to_device(data, device)
    return batch

Parameters:

Name	Type	Description	Default
batch	None	A batch of data that needs to be transferred to a new device.	None
device	None	The target device as defined in PyTorch.	None

Returns:

Type	Description
None	A reference to the data on the new device.

Note: This hook should only transfer the data and not modify it, nor should it move the data to any other device than the one passed in as argument (unless you know what you are doing). The :class:~pytorch_lightning.trainer.trainer.Trainer already takes care of splitting the batch and determines the target devices.

See Also: - :func:~pytorch_lightning.utilities.apply_func.move_data_to_device - :func:~pytorch_lightning.utilities.apply_func.apply_to_collection |

View Source

    def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any:

        """

        Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors

        wrapped in a custom data structure.

        The data types listed below (and any arbitrary nesting of them) are supported out of the box:

        - :class:`torch.Tensor` or anything that implements `.to(...)`

        - :class:`list`

        - :class:`dict`

        - :class:`tuple`

        - :class:`torchtext.data.batch.Batch`

        For anything else, you need to define how the data is moved to the target device (CPU, GPU, TPU, ...).

        Example::

            def transfer_batch_to_device(self, batch, device)

                if isinstance(batch, CustomBatch):

                    # move all tensors in your custom data structure to the device

                    batch.samples = batch.samples.to(device)

                    batch.targets = batch.targets.to(device)

                else:

                    batch = super().transfer_batch_to_device(data, device)

                return batch

        Args:

            batch: A batch of data that needs to be transferred to a new device.

            device: The target device as defined in PyTorch.

        Returns:

            A reference to the data on the new device.

        Note:

            This hook should only transfer the data and not modify it, nor should it move the data to

            any other device than the one passed in as argument (unless you know what you are doing).

            The :class:`~pytorch_lightning.trainer.trainer.Trainer` already takes care of splitting the

            batch and determines the target devices.

        See Also:

            - :func:`~pytorch_lightning.utilities.apply_func.move_data_to_device`

            - :func:`~pytorch_lightning.utilities.apply_func.apply_to_collection`

        """

        return move_data_to_device(batch, device)

type

def type(
    self,
    dst_type: Union[str, torch.dtype]
) -> torch.nn.modules.module.Module

Casts all parameters and buffers to :attr:dst_type.

Parameters:

Name	Type	Description	Default
dst_type	type or string	the desired type	None

Returns:

Type	Description
Module	self

View Source

    def type(self, dst_type: Union[str, torch.dtype]) -> Module:

        """Casts all parameters and buffers to :attr:`dst_type`.

        Arguments:

            dst_type (type or string): the desired type

        Returns:

            Module: self

        """

        self._dtype = dst_type

        return super().type(dst_type=dst_type)

unfreeze

def unfreeze(
    self
) -> None

Unfreeze all parameters for training.

.. code-block:: python

model = MyLightningModule(...)
model.unfreeze()

View Source

    def unfreeze(self) -> None:

        """

        Unfreeze all parameters for training.

        .. code-block:: python

            model = MyLightningModule(...)

            model.unfreeze()

        """

        for param in self.parameters():

            param.requires_grad = True

        self.train()

val_dataloader

def val_dataloader(
    self
)

View Source

    def val_dataloader(self):

        val_loader = DataLoader(

            self.val_dataset,

            batch_size=self.batch_size,

            collate_fn=self.val_dataset.collate_fn,

            shuffle=False,

            num_workers=cpu_count() // 2,

        )

        self.coco_evaluator = self._get_evaluator(val_loader.dataset)

        return val_loader

validation_end

def validation_end(
    self,
    outputs
)

Warnings:

Deprecated in v0.7.0. Use :meth:validation_epoch_end instead. Will be removed in 1.0.0.

View Source

    def validation_end(self, outputs):

        """

        Warnings:

            Deprecated in v0.7.0. Use :meth:`validation_epoch_end` instead.

            Will be removed in 1.0.0.

        """

validation_epoch_end

def validation_epoch_end(
    self,
    outputs
)

View Source

    @auto_move_data

    def validation_epoch_end(self, outputs):

        if self.train_rpn:

            return {}

        elif self.train_roi:

            # TODO: above

            return {}

        else:

            self.coco_evaluator.synchronize_between_processes()

            self.coco_evaluator.accumulate()

            self.coco_evaluator.summarize()

            metric = self.coco_evaluator.coco_eval["bbox"].stats[0]

            metric = torch.as_tensor(metric)

            tensorboard_logs = {"main_score": metric}

            self.coco_evaluator = self._get_evaluator(self.val_dataset)  # need to update for the new evaluation

            return {"val_loss": metric, "log": tensorboard_logs, "progress_bar": tensorboard_logs}

validation_step

def validation_step(
    self,
    batch,
    batch_idx
)

View Source

    @auto_move_data

    def validation_step(self, batch, batch_idx):

        images, targets, _ = batch

        if self.train_rpn:  # rpn doesn't compute loss for val

            return {}

        elif self.train_roi:

            # TODO: scores are predictions scores, not a metric! iou? + acc?

            return {}

        else:

            images = list(image for image in images)

            targets = [{k: v for k, v in t.items()} for t in targets]

            outputs = self(images, targets=targets)

            ret = {target["image_id"].item(): output for target, output in zip(targets, outputs)}

            self.coco_evaluator.update(ret)

            return {}

validation_step_end

def validation_step_end(
    self,
    *args,
    **kwargs
) -> Dict[str, torch.Tensor]

Use this when validating with dp or ddp2 because :meth:validation_step

will operate on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss.

Note: If you later switch to ddp or some other mode, this will still be called so that you don't have to change your code.

.. code-block:: python

# pseudocode
sub_batches = split_batches_for_dp(batch)
batch_parts_outputs = [validation_step(sub_batch) for sub_batch in sub_batches]
validation_step_end(batch_parts_outputs)

Parameters:

Name	Type	Description	Default
batch_parts_outputs	None	What you return in :meth:`validation_step`
for each batch part.

Return: Dict or OrderedDict - passed to the :meth:validation_epoch_end method. | None |

View Source

    def validation_step_end(self, *args, **kwargs) -> Dict[str, Tensor]:

        """

        Use this when validating with dp or ddp2 because :meth:`validation_step`

        will operate on only part of the batch. However, this is still optional

        and only needed for things like softmax or NCE loss.

        Note:

            If you later switch to ddp or some other mode, this will still be called

            so that you don't have to change your code.

        .. code-block:: python

            # pseudocode

            sub_batches = split_batches_for_dp(batch)

            batch_parts_outputs = [validation_step(sub_batch) for sub_batch in sub_batches]

            validation_step_end(batch_parts_outputs)

        Args:

            batch_parts_outputs: What you return in :meth:`validation_step`

                for each batch part.

        Return:

           Dict or OrderedDict - passed to the :meth:`validation_epoch_end` method.

        Examples:

            .. code-block:: python

                # WITHOUT validation_step_end

                # if used in DP or DDP2, this batch is 1/num_gpus large

                def validation_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    loss = self.softmax(out)

                    loss = nce_loss(loss)

                    return {'loss': loss}

                # --------------

                # with validation_step_end to do softmax over the full batch

                def validation_step(self, batch, batch_idx):

                    # batch is 1/num_gpus big

                    x, y = batch

                    out = self(x)

                    return {'out': out}

                def validation_epoch_end(self, outputs):

                    # this out is now the full size of the batch

                    out = outputs['out']

                    # this softmax now uses the full batch size

                    loss = nce_loss(loss)

                    return {'loss': loss}

        See Also:

            See the :ref:`multi-gpu-training` guide for more details.

        """

zero_grad

def zero_grad(
    self
) -> None

Sets gradients of all model parameters to zero.

View Source

    def zero_grad(self) -> None:

        r"""Sets gradients of all model parameters to zero."""

        if getattr(self, '_is_replica', False):

            warnings.warn(

                "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. "

                "The parameters are copied (in a differentiable manner) from the original module. "

                "This means they are not leaf nodes in autograd and so don't accumulate gradients. "

                "If you need gradients in your forward method, consider using autograd.grad instead.")

        for p in self.parameters():

            if p.grad is not None:

                p.grad.detach_()

                p.grad.zero_()