122-RegressionTree原理和实现

Regression Tree 回归树

1
2
3
4
5
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import seaborn as snn

1. 回归树原理和实现

回归树(Regression Tree)的目标也是经验风险最小化,对于某个特征来说,我们希望以某个标准将进行划分,使得以下loss最小:$$\underset{D_1}{\sum}(y_i - c_1)^2 + \underset{D_2}{\sum}(y_i - c_2)^2$$
其中,$$\begin{cases}c_1 = \cfrac{\sum y_i}{\parallel D_1\parallel}\\c_2 = \cfrac{\sum y_i}{\parallel D_2\parallel}\end{cases}$$

2. 多特征回归树

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from mpl_toolkits.mplot3d import Axes3D

def func(x, y):
return (0.2*x*x + 0.2*y*y-5)

x = np.linspace(-10, 10, 100).reshape((-1, 1))
y = np.linspace(-10, 10, 100).reshape((-1, 1))
z = func(x, y)

fig = plt.figure(figsize=(15,15))

ax = fig.gca(projection='3d')

x, y = np.meshgrid(x,y)
z = func(x, y)
surf = ax.plot_surface(x, y, z, cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)

# ax.set_zlim(0.01, 64.01)
# ax.zaxis.set_major_locator(LinearLocator(10))
# ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()

png

1
2
3
4
5
6
7
x = np.linspace(-10, 10, 100).reshape((-1, 1))
y = np.linspace(-10, 10, 100).reshape((-1, 1))
z = func(x, y)

s = np.hstack((x, y))
v = np.vstack((x,y))
x.shape, y.shape, z.shape, s.shape, v.shape

输出:

((100, 1), (100, 1), (100, 1), (100, 2), (200, 1))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
class TreeNode(object):
def __init__(self, features, label, depth, max_depth):
self.left = None
self.right = None
self.depth = depth
self.max_depth = max_depth
if depth >= max_depth:
self.is_leaf = True
else:
self.is_leaf = False

self.x_data = features.copy()
self.y_data = label.copy()
self.value = None
self.split_feature_index = None
self.split_value = None


def fit(self):
if self.is_leaf :
self.value = np.mean(self.y_data)
elif len(self.x_data) <= 1:
self.is_leaf = True
self.value = np.mean(self.y_data[0,0])
else :
best_loss,loss = np.inf, np.inf
rows, cols = self.x_data.shape
best_domain_1_tf , best_domain_2_tf = None, None

for feature_index in range(cols):
for i in range(0, rows):
domain_1_tf = self.x_data[:, feature_index] <= self.x_data[i, feature_index]
domain_2_tf = self.x_data[:, feature_index] > self.x_data[i, feature_index]
domain_1 = self.y_data[domain_1_tf]
domain_2 = self.y_data[domain_2_tf]

if (len(domain_1) >0) and (len(domain_2)>0):
c1 = np.mean(domain_1)
c2 = np.mean(domain_2)
loss = np.mean(domain_1 - c1) + np.mean(domain_2 - c2)

if loss < best_loss:
best_loss = loss
self.split_value = self.x_data[i, feature_index]
self.split_feature_index = feature_index
best_domain_1_tf = domain_1_tf
best_domain_2_tf = domain_2_tf

if (len(self.y_data[best_domain_1_tf]) == 0) or (len(self.y_data[best_domain_2_tf]) == 0) :
self.value = np.mean(self.y_data)
self.is_leaf = True
else :
self.left = TreeNode(self.x_data[best_domain_1_tf, :], self.y_data[best_domain_1_tf], self.depth + 1, self.max_depth)
self.left.fit()
self.right = TreeNode(self.x_data[best_domain_2_tf, :], self.y_data[best_domain_2_tf], self.depth + 1, self.max_depth)
self.right.fit()

def predict(self, x_pred):
if self.is_leaf :
return self.value
else :
if x_pred[self.split_feature_index] <= self.split_value and self.left is not None:
return self.left.predict(x_pred)
elif x_pred[self.split_feature_index] > self.split_value and self.right is not None:
return self.right.predict(x_pred)
else :
return self.value


class RegressionTree(object):
def __init__(self, x, y, max_depth = 1):
self.max_depth = max_depth
self.root = TreeNode(x, y, 1, self.max_depth)

def fit(self):
self.root.fit()

def predict(self, x_pred):
return self.root.predict(x_pred)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
x = np.linspace(-10, 10, 100).reshape(-1, 1)
# y = np.linspace(-20, 20, 100) + np.random.normal(loc = 0, scale = 3.5 , size=(100,))
y = 0.5 * x * x + np.random.normal(loc = 0, scale = 3.5 , size=(100,1))
# y = 2 * x + np.random.normal(loc = 0, scale = 3.5 , size=(100,1))


plt.figure(figsize=(10,5))
plt.scatter(x, y)

for i in range(1,15, 2):
maxDepth = i
myTree = RegressionTree(x, y, max_depth = maxDepth)
myTree.fit()

x_predict = x
y_predict = [myTree.predict(s) for s in x_predict]


plt.figure(figsize=(10, 5))
plt.scatter(x, y, s=10, color='r')
plt.plot(x_predict, y_predict, label="max_depth={}".format(maxDepth))
plt.title('Regression Tree: max_depth={}'.format(maxDepth))
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

png

png

png

png

png

png

png

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
shape = 150

x = np.linspace(-10, 10, shape).reshape((-1, 1))
y = np.linspace(-10, 10, shape).reshape((-1, 1))
x, y = np.meshgrid(x,y)
z = func(x, y)

xd = x.reshape(-1, 1)*x.reshape(-1, 1)
yd = y.reshape(-1, 1)*y.reshape(-1, 1)
xy = x.reshape(-1, 1)*y.reshape(-1, 1)
print(xd.shape, yd.shape)

xx = np.hstack((x.reshape(-1, 1), y.reshape(-1, 1), xd, yd, xy))

maxDepth = 20
myTree = RegressionTree(xx, z.reshape(-1, 1), max_depth = maxDepth)
myTree.fit()

yppp = [myTree.predict(xv) for xv in xx]




fig = plt.figure(figsize=(15,15))
ax = fig.gca(projection='3d')
ax.scatter3D(x, y, z, cmap='Greens')
ax.scatter3D(x, y, np.array(yppp).reshape(shape, shape), cmap='Reds')

# surf1 = ax.plot_surface(x, y, z, cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)
# surf2 = ax.plot_surface(x, y, np.array(yppp).reshape(100, 100), cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)
# ax.set_zlim(0.01, 64.01)
plt.show()
(22500, 1) (22500, 1)

png

1
2
3
4
5
a1 = np.linspace(1,10, 20).reshape(10,2)


a2 = np.linspace(1,10, 10)
a1, a2