After reading this post, you should be able to define your own forward and backward passes, i.e., write your own operators.

Contents: Tanh · Formula · Derivation of the derivative · Advantages · Disadvantages · Custom Tanh · Comparison with the Torch implementation · Visualization

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline

plt.rcParams['figure.figsize'] = (7, 3.5)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['axes.unicode_minus'] = False  # fix rendering of the minus sign on axis labels

Tanh
Formula
\tanh(x) = \frac{\sinh(x)}{\cosh(x)} = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}

\tanh(x) = 2\sigma(2x) - 1
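As a quick sanity check of the second identity, a minimal sketch that only uses the torch import above:

# Verify tanh(x) == 2 * sigmoid(2x) - 1 on a few random points
x = torch.randn(5)
print(torch.allclose(torch.tanh(x), 2 * torch.sigmoid(2 * x) - 1, atol=1e-6))  # expected: True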
Derivation of the derivative
\begin{aligned}
\tanh'(x) &= \left(\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}\right)' \\
&= \left[(e^{x} - e^{-x})(e^{x} + e^{-x})^{-1}\right]' \\
&= (e^{x} + e^{-x})(e^{x} + e^{-x})^{-1} + (e^{x} - e^{-x})(-1)(e^{x} + e^{-x})^{-2}(e^{x} - e^{-x}) \\
&= 1 - (e^{x} - e^{-x})^{2}(e^{x} + e^{-x})^{-2} \\
&= 1 - \frac{(e^{x} - e^{-x})^{2}}{(e^{x} + e^{-x})^{2}} \\
&= 1 - \tanh^{2}(x)
\end{aligned}
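The last line can be checked numerically with a central finite difference; a small sketch, again relying only on torch:

# Compare a finite-difference estimate of tanh'(x) with 1 - tanh(x)^2
x = torch.linspace(-3, 3, 7, dtype=torch.double)
h = 1e-6
numeric = (torch.tanh(x + h) - torch.tanh(x - h)) / (2 * h)
analytic = 1 - torch.tanh(x).pow(2)
print(torch.allclose(numeric, analytic, atol=1e-8))  # expected: True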
Advantages
Tanh, also known as the hyperbolic tangent, maps inputs to the range [-1, 1]. It works well when features differ markedly, and over successive iterations it keeps amplifying the feature effect. The key difference from sigmoid is that tanh is zero-centered, so in practice tanh usually performs better than sigmoid. The paper [LeCun, Y., et al., Backpropagation applied to handwritten zip code recognition. Neural Computation, 1989. 1(4): p. 541-551.] notes that tanh networks converge faster than sigmoid networks: because the output mean of tanh is closer to 0 than that of sigmoid, SGD behaves more like the natural gradient [4] (a second-order optimization technique), which reduces the number of iterations required. Overall it is an excellent choice that suits almost every scenario.
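The zero-centered point is easy to see empirically; the sketch below (assuming a symmetric, zero-mean input) compares the mean activations of tanh and sigmoid:

# tanh activations average out near 0, sigmoid activations near 0.5
x = torch.randn(10_000)
print(torch.tanh(x).mean())     # close to 0
print(torch.sigmoid(x).mean())  # close to 0.5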
Disadvantages
The derivative approaches 0 in both the positive and negative saturation regions, which causes vanishing gradients. In addition, tanh involves relatively expensive exponential operations.
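A small sketch of the saturation problem: at |x| = 5 the gradient is already close to zero, while it is 1 at the origin.

x = torch.tensor([0.0, 2.0, 5.0], requires_grad=True)
torch.tanh(x).sum().backward()
print(x.grad)  # roughly tensor([1.0000, 0.0707, 0.0002]): the gradient vanishes in the saturated region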
Custom Tanh
class SelfDefinedTanh(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inp):
        exp_x = torch.exp(inp)
        exp_x_ = torch.exp(-inp)
        result = torch.divide((exp_x - exp_x_), (exp_x + exp_x_))
        ctx.save_for_backward(result)  # stash tanh(x) for the backward pass
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # ctx.saved_tensors is a tuple of the tensors saved in forward
        result, = ctx.saved_tensors
        return grad_output * (1 - result.pow(2))  # tanh'(x) = 1 - tanh^2(x)


class Tanh(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        out = SelfDefinedTanh.apply(x)
        return out


def tanh_sigmoid(x):
    """According to the equation tanh(x) = 2 * sigmoid(2x) - 1"""
    return torch.mul(torch.sigmoid(torch.mul(x, 2)), 2) - 1


Comparison with the Torch implementation
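Before comparing against torch.tanh, the hand-written backward can be verified against a finite-difference estimate with torch.autograd.gradcheck; a minimal sketch (gradcheck expects double-precision inputs to keep the numerical error small):

x = torch.randn(8, dtype=torch.double, requires_grad=True)
print(torch.autograd.gradcheck(SelfDefinedTanh.apply, (x,)))  # True if backward matches forward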
# self defined
torch.manual_seed(0)

tanh = Tanh()  # SelfDefinedTanh
inp = torch.randn(5, requires_grad=True)
out = tanh((inp + 1).pow(2))

print(f"Out is\n{out}")

out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")

out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

Out is
tensor([1.0000, 0.4615, 0.8831, 0.9855, 0.0071],
       grad_fn=<SelfDefinedTanhBackward>)

First call
tensor([ 5.0889e-05,  1.1121e+00, -5.1911e-01,  9.0267e-02, -1.6904e-01])

Second call
tensor([ 1.0178e-04,  2.2243e+00, -1.0382e+00,  1.8053e-01, -3.3807e-01])

Call after zeroing gradients
tensor([ 5.0889e-05,  1.1121e+00, -5.1911e-01,  9.0267e-02, -1.6904e-01])

# self defined tanh_sigmoid
torch.manual_seed(0)

inp = torch.randn(5, requires_grad=True)
out = tanh_sigmoid((inp + 1).pow(2))

print(f"Out is\n{out}")

out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")

out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

Out is
tensor([1.0000, 0.4615, 0.8831, 0.9855, 0.0071], grad_fn=<SubBackward0>)

First call
tensor([ 5.0889e-05,  1.1121e+00, -5.1911e-01,  9.0267e-02, -1.6904e-01])

Second call
tensor([ 1.0178e-04,  2.2243e+00, -1.0382e+00,  1.8053e-01, -3.3807e-01])

Call after zeroing gradients
tensor([ 5.0889e-05,  1.1121e+00, -5.1911e-01,  9.0267e-02, -1.6904e-01])

# torch defined
torch.manual_seed(0)

inp = torch.randn(5, requires_grad=True)
out = torch.tanh((inp + 1).pow(2))

print(f"Out is\n{out}")

out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nFirst call\n{inp.grad}")

out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

inp.grad.zero_()
out.backward(torch.ones_like(inp), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

Out is
tensor([1.0000, 0.4615, 0.8831, 0.9855, 0.0071], grad_fn=<TanhBackward>)

First call
tensor([ 5.0283e-05,  1.1121e+00, -5.1911e-01,  9.0267e-02, -1.6904e-01])

Second call
tensor([ 1.0057e-04,  2.2243e+00, -1.0382e+00,  1.8053e-01, -3.3807e-01])

Call after zeroing gradients
tensor([ 5.0283e-05,  1.1121e+00, -5.1911e-01,  9.0267e-02, -1.6904e-01])

The three results above show that whether tanh is computed via sigmoid or directly from its defining formula, the output and the gradient are the same. For large inputs, however, torch's gradient is slightly smaller, so torch presumably subtracts a small value there.
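The same conclusion can be checked programmatically with torch.allclose; a quick sketch that reuses the definitions above:

torch.manual_seed(0)
x = (torch.randn(5) + 1).pow(2)
print(torch.allclose(Tanh()(x), torch.tanh(x), atol=1e-6))        # custom op matches torch.tanh
print(torch.allclose(tanh_sigmoid(x), torch.tanh(x), atol=1e-6))  # sigmoid form matches as well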
Visualization
# visualization
inp = torch.arange(-8, 8, 0.1, requires_grad=True)
out = tanh(inp)
out.sum().backward()

inp_grad = inp.grad

plt.plot(inp.detach().numpy(), out.detach().numpy(), label=r"$\tanh(x)$", alpha=0.7)
plt.plot(inp.detach().numpy(), inp_grad.numpy(), label=r"$\tanh'(x)$", alpha=0.5)
plt.grid()
plt.legend()
plt.show()