# Source code for slapo.op.fused_bias

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""Fuse bias with the subsequent ops, such as activation function or dropout."""
# pylint: disable=abstract-method
from __future__ import annotations

import math

import torch
from torch.nn import functional as F

class BiasGeLUFunction(torch.autograd.Function):
    """Bias+GeLU. Copied from Megatron-LM.

    Adds ``bias`` to the input and applies the tanh approximation of GELU,
    with a hand-written backward pass. Invoke it through
    ``BiasGeLUFunction.apply(inp, bias)``.

    ``bias`` may be ``None``; in that case the function computes the plain
    tanh-approximated GELU of ``inp`` and reports no gradient for the bias.
    """

    # pylint: disable=no-self-argument, arguments-differ

    # Forward kernel: tanh-approximated GELU evaluated at (bias + y).
    # 0.79788456 is sqrt(2/pi) truncated to 8 decimal places.
    @torch.jit.script
    def bias_gelu(bias, y):
        x = bias + y
        return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

    # gradient of tanh approximation of gelu
    # gradient of actual gelu is:
    # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
    @torch.jit.script
    def bias_gelu_back(g, bias, y):
        x = bias + y
        tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
        # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
        ff = 0.5 * x * (
            (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
        ) + 0.5 * (1 + tanh_out)
        return ff * g

    @staticmethod
    # bias is an optional argument
    def forward(ctx, inp, bias):
        """Return the tanh-approximated GELU of ``inp + bias``.

        Args:
            ctx: Autograd context used to stash tensors for ``backward``.
            inp: Input tensor.
            bias: Bias tensor added to ``inp`` before the activation,
                or ``None`` to skip the bias.

        Returns:
            The activated tensor.
        """
        # Save the caller's arguments as-is (``None`` is allowed here) so
        # that ``backward`` can tell whether a bias was actually supplied.
        ctx.save_for_backward(inp, bias)
        if bias is None:
            # The scripted kernel only accepts tensors; a 0-dim zero of the
            # input's dtype/device makes ``bias + y`` equal to ``y``.
            bias = inp.new_zeros(())
        return BiasGeLUFunction.bias_gelu(bias, inp)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradients with respect to ``inp`` and ``bias``.

        Args:
            ctx: Autograd context populated by ``forward``.
            grad_output: Gradient flowing in from the output.

        Returns:
            A 2-tuple ``(grad_inp, grad_bias)``. Both entries are the same
            tensor because the output depends on the two inputs only through
            their sum; ``grad_bias`` is ``None`` when no bias was given.
        """
        inp, bias = ctx.saved_tensors
        has_bias = bias is not None
        if not has_bias:
            # Mirror the zero placeholder used in ``forward``.
            bias = inp.new_zeros(())
        grad = BiasGeLUFunction.bias_gelu_back(grad_output, bias, inp)
        return grad, (grad if has_bias else None)
def new_gelu(inp):
    """New GELU activation function copied from HuggingFace transformers.

    Computes ``0.5 * inp * (1 + tanh(sqrt(2 / pi) * (inp + 0.044715 * inp^3)))``.

    Args:
        inp: Input tensor.

    Returns:
        The activated tensor.
    """
    cubed = torch.pow(inp, 3.0)
    tanh_arg = math.sqrt(2.0 / math.pi) * (inp + 0.044715 * cubed)
    gate = 1.0 + torch.tanh(tanh_arg)
    return 0.5 * inp * gate
def bias_new_gelu(inp: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    """Add ``bias`` to ``inp`` and apply ``new_gelu`` to the sum.

    Args:
        inp: Input tensor.
        bias: Bias tensor added to ``inp``.

    Returns:
        ``new_gelu(inp + bias)``.
    """
    biased = inp + bias
    return new_gelu(biased)


def bias_dropout(
    x: torch.Tensor,
    bias: torch.Tensor,
    p: float = 0.5,
    training: bool = True,
    inplace: bool = False,
) -> torch.Tensor:
    """Add ``bias`` to ``x`` and apply dropout to the sum.

    Args:
        x: Input tensor.
        bias: Bias tensor added to ``x``.
        p: Forwarded to ``F.dropout`` as ``p``.
        training: Forwarded to ``F.dropout`` as ``training``.
        inplace: Forwarded to ``F.dropout`` as ``inplace``; it is applied to
            the freshly computed sum, not to ``x`` itself.

    Returns:
        The result of ``F.dropout`` on ``x + bias``.
    """
    biased = x + bias
    return F.dropout(biased, p=p, training=training, inplace=inplace)