Newton's Method

Newton's iteration method:

1. Solving the equation $f(x) = 0$

Expand $f$ in a first-order Taylor series around $x_0$: $$f(x) = f(x_0) + f'(x_0)(x - x_0) + \text{higher-order terms} \approx f(x_0) + f'(x_0)(x - x_0)$$
Setting this linear approximation to zero and solving for $x$ gives the iteration $$x_{n+1} = x_n - \cfrac{f(x_n)}{f'(x_n)}$$
Iterating this formula converges to a root of $f(x) = 0$ (given a suitable starting point).
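
To make the update concrete, here is a minimal sketch with hand-coded derivatives (the helper `newton_root`, its tolerance, and `max_iter` are illustrative choices, not part of the original post):

```python
def newton_root(f, df, x0, tol=1e-8, max_iter=100):
    """Find a root of f by iterating x_{n+1} = x_n - f(x_n) / f'(x_n)."""
    x = x0
    for _ in range(max_iter):
        fx = f(x)
        if abs(fx) < tol:       # close enough to a root
            return x
        x = x - fx / df(x)      # Newton update
    return x

# Example: the cubic used later in this post, with its derivative written by hand
root = newton_root(lambda x: -x**3 + 5*x**2 - 9*x + 26,
                   lambda x: -3*x**2 + 10*x - 9,
                   x0=-7.0)
print(root)  # ≈ 4.31, matching the autograd version below
```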

2. Optimization: finding maxima and minima

Expand $f$ in a second-order Taylor series around $x_0$: $$f(x) = f(x_0) + f'(x_0)(x - x_0) + \cfrac{1}{2}f''(x_0)(x - x_0)^2 + \text{higher-order terms}$$
Since the first derivative of a function vanishes at an extremum, this optimization problem can be converted into solving the equation $f'(x) = 0$.
Differentiating the quadratic approximation, the solution of the following equation is the extremum we are looking for: $$f'(x_0) + f''(x_0)(x - x_0) = 0$$
Applying the iteration from method 1 to $f'$ then yields the extremum $x$: $$x_{n+1} = x_n - \cfrac{f'(x_n)}{f''(x_n)}$$
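
As a sketch of this variant (the helper `newton_extremum` and its parameters are again illustrative, not from the original), the same loop applied to $f'$ finds a critical point:

```python
def newton_extremum(df, d2f, x0, tol=1e-8, max_iter=100):
    """Find a critical point of f by iterating x_{n+1} = x_n - f'(x_n) / f''(x_n)."""
    x = x0
    for _ in range(max_iter):
        g = df(x)
        if abs(g) < tol:        # f'(x) ≈ 0: candidate extremum
            return x
        x = x - g / d2f(x)      # Newton step applied to f'
    return x

# Example: f(x) = (x - 2)^2 has f'(x) = 2(x - 2), f''(x) = 2; minimum at x = 2
print(newton_extremum(lambda x: 2*(x - 2), lambda x: 2.0, x0=10.0))  # 2.0
```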

```python
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

x = torch.linspace(-10, 10, 2000)
y = -x**3 + 5*x**2 - 9*x + 26

plt.figure(figsize=(20, 10))

plt.plot(x.numpy(), y.numpy())
plt.grid()
plt.show()
```

*(figure: plot of $f(x) = -x^3 + 5x^2 - 9x + 26$)*

```python
s = torch.tensor(-7.0, requires_grad=True)

def func(s):
    return -s**3 + 5*s**2 - 9*s + 26

step = 0

while abs(func(s)) > 1e-3:
    step += 1
    f = func(s)
    f.backward(retain_graph=True)

    with torch.no_grad():
        grd = s.grad

        if grd != 0:
            # tangent line at (s, f): y = f'(s) * (x - s) + f(s)
            yy = grd * x + f - grd * s

            plt.figure(figsize=(10, 6))
            plt.plot(x.numpy(), y.numpy(), x.numpy(), yy.numpy())
            plt.grid()
            plt.legend(["f(x) = -x^3 + 5x^2 - 9x + 26",
                        "y = {}x + {}".format(grd, f - grd * s)])
            plt.title("step={}, error={}, x={}, y={}".format(step, abs(f), s, f))
            plt.show()

            # Newton update: jump to where the tangent line crosses zero
            s -= f / grd
        s.grad.zero_()

s.detach().numpy(), f.detach().numpy()
```

*(figures: one tangent-line plot per Newton step, 11 steps in total)*

```
(array(4.311271, dtype=float32), array(-0.00435638, dtype=float32))
```

Contour plots

```python
def f(x, y):
    return (1 - 0.5 * x + x**5 + y**3) * np.exp(-x**2 - y**2)

n = 256
xx = np.linspace(-3, 3, n)
yy = np.linspace(-3, 3, n)
```
```python
def plot_contour(x, y, func, xpoint=None, ypoint=None, density=20, title=""):
    x, y = np.meshgrid(x, y)

    plt.figure(figsize=(10, 6))
    # plt.contourf(x, y, func(x, y))
    # plt.contourf(x, y, func(x, y), cmap=plt.cm.hot)  # filled heat map

    # draw contour lines; `density` controls how many levels are drawn
    c = plt.contour(x, y, func(x, y), density)
    # plt.clabel(c, inline=True, fontsize=10)

    if (xpoint is not None) and (ypoint is not None):
        plt.scatter(xpoint, ypoint)
        plt.plot(xpoint, ypoint, "r")
    plt.title(title)
    plt.show()

# plot_contour(xx, yy, f)
```
```python
x = torch.tensor(-2, dtype=torch.float32, requires_grad=True)
y = torch.tensor(2, dtype=torch.float32, requires_grad=True)

xp = []
yp = []
lr = 0.5

for i in range(1000):
    if i % 100 == 0:
        # record the trajectory every 100 steps
        xp.append(x.item())
        yp.append(y.item())

    z = (1 - 0.5 * x + x**5 + y**3) * torch.exp(-x**2 - y**2)
    z.backward()
    with torch.no_grad():
        x -= lr * x.grad
        y -= lr * y.grad
        x.grad.zero_()
        y.grad.zero_()

plot_contour(xx, yy, f, xp, yp)

xp, yp
```

*(figure: contour plot of f with the recorded gradient-descent trajectory)*

```
([-2.0,
  -1.6888387203216553,
  -1.688839077949524,
  -1.6888388395309448,
  -1.6888387203216553,
  -1.688839077949524,
  -1.6888388395309448,
  -1.6888387203216553,
  -1.688839077949524,
  -1.6888388395309448],
 [2.0,
  4.168582671673466e-41,
  1.401298464324817e-45,
  1.401298464324817e-45,
  1.401298464324817e-45,
  1.401298464324817e-45,
  1.401298464324817e-45,
  1.401298464324817e-45,
  1.401298464324817e-45,
  1.401298464324817e-45])
```

Gradient descent

```python
def func_1(x, y):
    return 2.0*x**2 + 2*y**2

def func_2(x, y):
    return 0.1*x**2 + 2*y**2

def func_3(x, y):
    return 1.0*x**2 + 2*y**2


def compare_gradient_descent(lr, func):
    xx = np.linspace(-4, 4, n)
    yy = np.linspace(-4, 4, n)

    xp = []
    yp = []

    x = torch.tensor(-3, dtype=torch.float32, requires_grad=True)
    y = torch.tensor(3, dtype=torch.float32, requires_grad=True)

    for i in range(20):
        xp.append(x.item())
        yp.append(y.item())
        z = func(x, y)
        z.backward()

        with torch.no_grad():
            x -= lr * x.grad
            y -= lr * y.grad

            x.grad.zero_()
            y.grad.zero_()

    plot_contour(xx, yy, func, xp, yp,
                 title="learning_rate={}, func={}".format(lr, func.__name__))
```
```python
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection

fig = plt.figure(figsize=(15, 15))
# fig.gca(projection='3d') was deprecated in Matplotlib 3.4; use add_subplot instead
ax = fig.add_subplot(111, projection='3d')

x, y = np.meshgrid(xx, yy)
z = func_1(x, y)
surf = ax.plot_surface(x, y, z, cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)

ax.set_zlim(0.01, 64.01)
# ax.zaxis.set_major_locator(LinearLocator(10))
# ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)

plt.show()
```

*(figure: 3D surface plot of func_1)*

```python
fig = plt.figure(figsize=(15, 15))          # wireframe needs a fresh figure
ax = fig.add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, rstride=10, cstride=10)
plt.show()
```

Choosing the learning_rate

  • Increasing the learning_rate can speed up convergence
  • Too large a learning_rate does not necessarily converge faster (it may oscillate)
```python
compare_gradient_descent(0.1, func_1)
compare_gradient_descent(0.2, func_1)
compare_gradient_descent(0.4, func_1)
```

*(figures: trajectories on func_1 for learning rates 0.1, 0.2, and 0.4)*

When the dimensions are scaled differently, the convergence speed differs as well.

```python
compare_gradient_descent(0.1, func_3)
compare_gradient_descent(0.4, func_3)
```

*(figures: trajectories on func_3 for learning rates 0.1 and 0.4)*


Different scales across dimensions cause the dimensions to converge at different rates:

  • Learning rate moderate: some dimensions have already converged while others have not
  • Learning rate too small: none of the dimensions converge
  • Learning rate too large: the slow dimension converges while the fast dimension oscillates and diverges
```python
compare_gradient_descent(0.4, func_2)
compare_gradient_descent(0.1, func_2)
compare_gradient_descent(0.6, func_2)
```

*(figures: trajectories on func_2 for learning rates 0.4, 0.1, and 0.6)*

Momentum

https://distill.pub/2017/momentum/
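
The linked article is the reference for this section. As a minimal sketch in the style of the code above (reusing `func_2` and `plot_contour`; the `lr` and `beta` values are illustrative assumptions, not taken from the article), momentum keeps a velocity term that accumulates past gradients, damping the oscillation along the steep dimension while speeding up the flat one:

```python
# Gradient descent with momentum on the ill-conditioned func_2(x, y) = 0.1x^2 + 2y^2
x = torch.tensor(-3.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
vx, vy = 0.0, 0.0          # velocity: running accumulation of gradients
lr, beta = 0.4, 0.9        # illustrative step size and momentum coefficient

xp, yp = [], []
for i in range(20):
    xp.append(x.item())
    yp.append(y.item())
    z = func_2(x, y)
    z.backward()
    with torch.no_grad():
        vx = beta * vx + x.grad   # accumulate gradient history
        vy = beta * vy + y.grad
        x -= lr * vx              # step along the velocity, not the raw gradient
        y -= lr * vy
        x.grad.zero_()
        y.grad.zero_()

plot_contour(np.linspace(-4, 4, n), np.linspace(-4, 4, n), func_2,
             xp, yp, title="momentum, lr={}, beta={}".format(lr, beta))
```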