diff --git "a/11 \345\256\236\350\267\265\346\226\271\346\263\225\350\256\272.pdf" "b/11 \345\256\236\350\267\265\346\226\271\346\263\225\350\256\272.pdf"
new file mode 100644
index 0000000..8ffcc49
Binary files /dev/null and "b/11 \345\256\236\350\267\265\346\226\271\346\263\225\350\256\272.pdf" differ
diff --git "a/\347\272\277\346\200\247\344\273\243\346\225\260.pdf" "b/2 \347\272\277\346\200\247\344\273\243\346\225\260.pdf"
similarity index 69%
rename from "\347\272\277\346\200\247\344\273\243\346\225\260.pdf"
rename to "2 \347\272\277\346\200\247\344\273\243\346\225\260.pdf"
index 43f29d3..321687e 100644
Binary files "a/\347\272\277\346\200\247\344\273\243\346\225\260.pdf" and "b/2 \347\272\277\346\200\247\344\273\243\346\225\260.pdf" differ
diff --git "a/\346\246\202\347\216\207\344\270\216\344\277\241\346\201\257\350\256\272.pdf" "b/3 \346\246\202\347\216\207\344\270\216\344\277\241\346\201\257\350\256\272.pdf"
similarity index 88%
rename from "\346\246\202\347\216\207\344\270\216\344\277\241\346\201\257\350\256\272.pdf"
rename to "3 \346\246\202\347\216\207\344\270\216\344\277\241\346\201\257\350\256\272.pdf"
index d056495..e2ead6a 100644
Binary files "a/\346\246\202\347\216\207\344\270\216\344\277\241\346\201\257\350\256\272.pdf" and "b/3 \346\246\202\347\216\207\344\270\216\344\277\241\346\201\257\350\256\272.pdf" differ
diff --git "a/\346\225\260\345\200\274\350\256\241\347\256\227.pdf" "b/4 \346\225\260\345\200\274\350\256\241\347\256\227.pdf"
similarity index 58%
rename from "\346\225\260\345\200\274\350\256\241\347\256\227.pdf"
rename to "4 \346\225\260\345\200\274\350\256\241\347\256\227.pdf"
index 2107fb5..b48692b 100644
Binary files "a/\346\225\260\345\200\274\350\256\241\347\256\227.pdf" and "b/4 \346\225\260\345\200\274\350\256\241\347\256\227.pdf" differ
diff --git "a/5 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200.pdf" "b/5 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200.pdf"
new file mode 100644
index 0000000..f279056
Binary files /dev/null and "b/5 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200.pdf" differ
diff --git "a/\346\267\261\345\272\246\345\211\215\351\246\210\347\275\221\347\273\234.pdf" "b/6 \346\267\261\345\272\246\345\211\215\351\246\210\347\275\221\347\273\234.pdf"
similarity index 91%
rename from "\346\267\261\345\272\246\345\211\215\351\246\210\347\275\221\347\273\234.pdf"
rename to "6 \346\267\261\345\272\246\345\211\215\351\246\210\347\275\221\347\273\234.pdf"
index e70f89e..e74c85d 100644
Binary files "a/\346\267\261\345\272\246\345\211\215\351\246\210\347\275\221\347\273\234.pdf" and "b/6 \346\267\261\345\272\246\345\211\215\351\246\210\347\275\221\347\273\234.pdf" differ
diff --git "a/7 \346\267\261\345\272\246\345\255\246\344\271\240\344\270\255\347\232\204\346\255\243\345\210\231\345\214\226.pdf" "b/7 \346\267\261\345\272\246\345\255\246\344\271\240\344\270\255\347\232\204\346\255\243\345\210\231\345\214\226.pdf"
new file mode 100644
index 0000000..864721e
Binary files /dev/null and "b/7 \346\267\261\345\272\246\345\255\246\344\271\240\344\270\255\347\232\204\346\255\243\345\210\231\345\214\226.pdf" differ
diff --git "a/8 \346\267\261\345\272\246\346\250\241\345\236\213\344\270\255\347\232\204\344\274\230\345\214\226.pdf" "b/8 \346\267\261\345\272\246\346\250\241\345\236\213\344\270\255\347\232\204\344\274\230\345\214\226.pdf"
new file mode 100644
index 0000000..9264251
Binary files /dev/null and "b/8 \346\267\261\345\272\246\346\250\241\345\236\213\344\270\255\347\232\204\344\274\230\345\214\226.pdf" differ
diff --git "a/\345\215\267\347\247\257\347\275\221\347\273\234.pdf" "b/9 \345\215\267\347\247\257\347\275\221\347\273\234.pdf"
similarity index 81%
rename from "\345\215\267\347\247\257\347\275\221\347\273\234.pdf"
rename to "9 \345\215\267\347\247\257\347\275\221\347\273\234.pdf"
index 5deed4e..32e7298 100644
Binary files "a/\345\215\267\347\247\257\347\275\221\347\273\234.pdf" and "b/9 \345\215\267\347\247\257\347\275\221\347\273\234.pdf" differ
diff --git a/README.md b/README.md
index 06cca62..1ab3767 100755
--- a/README.md
+++ b/README.md
@@ -2,18 +2,38 @@
 
 《**深度学习**》是深度学习领域唯一的综合性图书，全称也叫做**深度学习 AI圣经(Deep Learning)**，由三位全球知名专家IanGoodfellow、YoshuaBengio、AaronCourville编著，全书囊括了数学及相关概念的背景知识，包括线性代数、概率论、信息论、数值优化以及机器学习中的相关内容。同时，它还介绍了工业界中实践者用到的深度学习技术，包括深度前馈网络、正则化、优化算法、卷积网络、序列建模和实践方法等，并且调研了诸如自然语言处理、语音识别、计算机视觉、在线推荐系统、生物信息学以及视频游戏方面的应用。最后，深度学习全书还提供了一些研究方向，涵盖的理论主题包括线性因子模型、自编码器、表示学习、结构化概率模型、蒙特卡罗方法、配分函数、近似推断以及深度生成模型，适用于相关专业的大学生或研究生使用。
 
-<img src="./docs/cover.jpg" width="400" height="600" alt="深度学习封面" align=center>
+<img src="./docs/cover.jpg" width="200" height="300" alt="深度学习封面" align=center>
 
-可以下载《深度学习》的中文版[pdf](https://github.com/MingchaoZhu/DeepLearning/raw/master/DL%E4%B8%AD%E6%96%87.pdf)和英文版[pdf](https://github.com/MingchaoZhu/DeepLearning/raw/master/DL%E8%8B%B1%E6%96%87.pdf)直接阅读。
+可以下载《深度学习》的中文版 [pdf](https://github.com/MingchaoZhu/DeepLearning/raw/master/DL%E4%B8%AD%E6%96%87.pdf) 和英文版 [pdf](https://github.com/MingchaoZhu/DeepLearning/raw/master/DL%E8%8B%B1%E6%96%87.pdf) 直接阅读。
 
 ---
 
-《深度学习》可以说是深度学习与人工智能的入门宝典，许多算法爱好者、机器学习培训班、互联网企业的面试，很多都参考这本书。但本书晦涩，加上官方没有提供代码实现，因此某些地方较难理解。本站**基于数学推导和产生原理重新描述了书中的概念**，并用**Python** (numpy 库为主) 复现了书本内容（推导过程和代码实现均见**pdf文件**，重要部分的实现代码也放入**code文件夹**中）。
+《深度学习》可以说是深度学习与人工智能的入门宝典，许多算法爱好者、机器学习培训班、互联网企业的面试，很多都参考这本书。但本书晦涩，加上官方没有提供代码实现，因此某些地方较难理解。本站**基于数学推导和产生原理重新描述了书中的概念**，并用**Python** (numpy 库为主) 复现了书本内容 ( **源码级代码实现。推导过程和代码实现均放在了下载区的 pdf 文件中**，重要部分的实现代码也放入 **code 文件夹**中 )。
 
-然而我水平有限，但我真诚地希望这项工作可以帮助到更多人学习深度学习算法。我需要大家的建议和帮助。如果你在阅读中遇到有误或解释不清的地方，希望可以汇总你的建议，提issue （最好不要一个一个地提）。如果你也想加入这项工作书写中或有其他问题，可以联系我的邮箱：deityrayleigh@gmail.com。
+然而我水平有限，但我真诚地希望这项工作可以帮助到更多人学习深度学习算法。我需要大家的建议和帮助。如果你在阅读中遇到有误或解释不清的地方，希望可以汇总你的建议，提issue。如果你也想加入这项工作书写中或有其他问题，可以联系我的邮箱：deityrayleigh@gmail.com。
 
 写的过程中参考了较多网上优秀的工作，所有参考资源保存在了`reference.txt`文件中。
 
+# 作者留言
+
+最近收到了一些读者的催更邮件，感谢认可，但依旧想在此统一解释一下。每个章节的制作，从每一个概念的详细描述、原理推导、作图、代码实现到生成最终的 pdf 文件，需要时间。为了可以解释清楚，你在 pdf 文件中看到的所有的图几乎都是我自己画的。如果你在阅读过程中遇到有想要描述的概念点，可以发邮件告知我。这个项目的工作会一直更新完，不会咕。最后，如果你认可这份工作的话，希望可以 watch、star、fork 三连一下，或者在其他平台转发推广。非常感谢你的认可与推广，谢谢！			——朱明超
+
+# 更新说明
+
+2020/3：
+
+1. 修改第五章决策树部分，补充 ID3 和 CART 的原理，代码实现以 CART 为主。
+2. 第七章添加 L1 和 L2 正则化最优解的推导 (即 L1 稀疏解的原理)。
+3. 第七章添加集成学习方法的推导与代码实现，包括 Bagging (随机森林)、Boosting (Adaboost、GBDT、XGBoost)。
+4. 第八章添加牛顿法与拟牛顿法 (DFP、BFGS、L-BFGS) 的推导。
+5. 第十一章添加高斯过程回归 (GPR) 与贝叶斯优化的推导与代码实现。
+
+后面每次的更新内容会统一放在 `update.txt` 文件中。
+
+# 章节目录与文件下载
+
+除了《深度学习》书中的概念点，**本项目也在各章节添加一些补充知识，例如第七章集成学习部分的 随机森林、Adaboost、GBDT、XGBoost 的原理剖析和代码实现等，又或者第十二章对当前一些主流方法的描述**。大的章节目录和 pdf 文件下载链接可以详见下表，而具体 pdf 文件中的实际目录请参考 `contents.txt`。
+
 | 中文章节 | 英文章节 | 下载<br />(含推导与代码实现) |
 | ------------ | ------------ | ------------ |
 | 第一章 前言 | 1 Introduction |  |
diff --git a/code/chapter 11.py b/code/chapter 11.py
new file mode 100644
index 0000000..492b879
--- /dev/null
+++ b/code/chapter 11.py	
@@ -0,0 +1,289 @@
+from abc import ABC, abstractmethod
+import itertools
+import re
+import time
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy.stats import norm
+
+def cal_conf_matrix(labels, preds):
+    """
+    Compute the 2x2 confusion matrix of a binary classifier at threshold 0.5.
+
+    labels: ground-truth labels (1 = positive, 0 = negative); other values are ignored
+    preds: predicted probabilities; >= 0.5 counts as a positive prediction
+    Returns an integer array laid out as [[TP, FN], [FP, TN]].
+    """
+    labels, preds = np.array(labels), np.array(preds)
+    is_pos, is_neg = labels == 1, labels == 0
+    pred_pos, pred_neg = preds >= 0.5, preds < 0.5
+    # Masks are combined with & rather than chained as df[mask_a][mask_b]; the
+    # chained form makes pandas re-align the second mask and emit a UserWarning.
+    tp, fn = np.sum(is_pos & pred_pos), np.sum(is_pos & pred_neg)
+    fp, tn = np.sum(is_neg & pred_pos), np.sum(is_neg & pred_neg)
+    cm = np.array([[tp, fn], [fp, tn]])
+    return cm
+
+
+
+def cal_PRF1(labels, preds):
+    """
+    Compute precision P, recall R and the F1 score from the confusion matrix.
+    """
+    cm = cal_conf_matrix(labels, preds)
+    P = cm[0,0]/(cm[0,0]+cm[1,0])  # TP / (TP + FP)
+    R = cm[0,0]/(cm[0,0]+cm[0,1])  # TP / (TP + FN)
+    F1 = 2*P*R/(P+R)  # harmonic mean of P and R
+    return P, R, F1
+
+
+def cal_PRcurve(labels, preds):
+    """
+    Compute the points of the precision-recall curve.
+
+    Samples are ranked by predicted probability (descending); the j-th row of the
+    returned DataFrame (columns 'P', 'R') treats the top j+1 samples as positive.
+    """
+    # Rank on the raw probabilities; no thresholding is needed because every
+    # sample's score serves as the decision threshold in turn.
+    result = pd.DataFrame({'probability': np.array(preds), 'label': np.array(labels)})
+    result.sort_values('probability', inplace=True, ascending=False)
+    n_pos = float(len(result[result['label'] == 1]))  # actual positives (loop-invariant)
+    PandR = pd.DataFrame(index=range(len(labels)), columns=('P', 'R'))
+    for j in range(len(result)):
+        result_j = result.head(n=j+1)  # samples predicted positive at this threshold
+        tp = len(result_j[result_j['label'] == 1])  # true positives among them
+        P, R = tp / float(len(result_j)), tp / n_pos
+        PandR.iloc[j] = [P, R]
+    return PandR
+
+
+def cal_ROCcurve(labels, preds):
+    """
+    Compute the points of the ROC curve.
+
+    Samples are ranked by predicted probability (descending); the j-th row of the
+    returned DataFrame (columns 'TPR', 'FPR') treats the top j+1 samples as positive.
+    """
+    # Rank on the raw probabilities; no thresholding is needed because every
+    # sample's score serves as the decision threshold in turn.
+    result = pd.DataFrame({'probability': np.array(preds), 'label': np.array(labels)})
+    result.sort_values('probability', inplace=True, ascending=False)
+    n_pos = float(len(result[result['label'] == 1]))  # actual positives (loop-invariant)
+    n_neg = float(len(result[result['label'] == 0]))  # actual negatives (loop-invariant)
+    TPRandFPR = pd.DataFrame(index=range(len(result)), columns=('TPR', 'FPR'))
+    for j in range(len(result)):
+        result_j = result.head(n=j+1)  # samples predicted positive at this threshold
+        TPR = len(result_j[result_j['label'] == 1]) / n_pos  # true positives / actual positives
+        FPR = len(result_j[result_j['label'] == 0]) / n_neg  # false positives / actual negatives
+        TPRandFPR.iloc[j] = [TPR, FPR]
+    return TPRandFPR
+
+
+def timeit(func):
+    """
+    Decorator that prints the wall-clock execution time of the wrapped function.
+    """
+    def wrapper(*args, **kwargs):
+        time_start = time.time()
+        result = func(*args, **kwargs)
+        time_end = time.time()
+        exec_time = time_end - time_start
+        print("{function} exec time: {time}s".format(function=func.__name__,time=exec_time))
+        return result
+    return wrapper
+
+@timeit
+def area_auc(labels, preds):
+    """
+    Compute AUC as the area under the ROC curve, summed rectangle by rectangle.
+    """
+    TPRandFPR = cal_ROCcurve(labels, preds)
+    # Accumulate the AUC as the sum of the areas of narrow rectangles
+    auc = 0.
+    prev_x = 0
+    for x, y in zip(TPRandFPR.FPR,TPRandFPR.TPR):
+        if x != prev_x:
+            auc += (x - prev_x) * y
+            prev_x = x
+    return auc
+
+@timeit
+def naive_auc(labels, preds):
+    """
+    Compute AUC as the fraction of (positive, negative) pairs in which the negative sorts before the positive.
+    """
+    n_pos = sum(labels)
+    n_neg = len(labels) - n_pos
+    total_pair = n_pos * n_neg  # total number of positive-negative pairs
+    labels_preds = zip(labels, preds)
+    labels_preds = sorted(labels_preds,key=lambda x:x[1])  # sort by predicted probability, ascending
+    count_neg = 0  # number of negatives seen so far
+    satisfied_pair = 0   # number of pairs counted as correctly ordered
+    for i in range(len(labels_preds)):
+        if labels_preds[i][0] == 1:
+            satisfied_pair += count_neg  # every negative seen so far pairs up with this positive
+        else:
+            count_neg += 1
+    return satisfied_pair / float(total_pair)
+
+
+#####----Bayesian Hyperparameter Optimization----####
+class KernelBase(ABC):
+    """Abstract kernel: subclasses implement `_kernel`; calling an instance evaluates it."""
+    def __init__(self):
+        super().__init__()
+        self.params = {}
+        self.hyperparams = {}
+
+    @abstractmethod
+    def _kernel(self, X, Y):
+        raise NotImplementedError
+
+    def __call__(self, X, Y=None):
+        return self._kernel(X, Y)
+
+    def __str__(self):
+        P, H = self.params, self.hyperparams
+        p_str = ", ".join(["{}={}".format(k, v) for k, v in P.items()])
+        return "{}({})".format(H["op"], p_str)
+
+    def summary(self):
+        return {
+            "op": self.hyperparams["op"],
+            "params": self.params,
+            "hyperparams": self.hyperparams,
+        }
+
+
+class RBFKernel(KernelBase):
+    """Kernel k(x, y) = exp(-||x - y||^2 / (2 * sigma^2)) evaluated on the rows of X and Y."""
+    def __init__(self, sigma=None):
+        """
+        RBF (radial basis function) kernel.
+        """
+        super().__init__()
+        self.hyperparams = {"op": "RBFKernel"}
+        self.params = {"sigma": sigma}  # if sigma is None, np.sqrt(n_features / 2) is used, where n_features is the number of features
+
+    def _kernel(self, X, Y=None):
+        """
+        Compute the RBF kernel for every pair of rows of X and Y. If Y is None, Y = X.
+
+        Parameters:
+        X: input array of shape (n_samples, n_features)
+        Y: input array of shape (m_samples, n_features)
+        """
+        X = X.reshape(-1, 1) if X.ndim == 1 else X
+        Y = X if Y is None else Y
+        Y = Y.reshape(-1, 1) if Y.ndim == 1 else Y
+        assert X.ndim == 2 and Y.ndim == 2, "X and Y must have 2 dimensions"
+        sigma = np.sqrt(X.shape[1] / 2) if self.params["sigma"] is None else self.params["sigma"]
+        X, Y = X / sigma, Y / sigma
+        D = -2 * X @ Y.T + np.sum(Y**2, axis=1) + np.sum(X**2, axis=1)[:, np.newaxis]  # pairwise squared distances: ||x||^2 - 2 x.y + ||y||^2
+        D[D < 0] = 0  # presumably clips small negatives caused by floating-point cancellation
+        return np.exp(-0.5 * D)
+    
+
+class KernelInitializer(object):
+    """Build a kernel from a string such as 'RBFKernel(sigma=1)'; NOTE(review): values are eval()'d, pass trusted strings only."""
+    def __init__(self, param=None):
+        self.param = param
+
+    def __call__(self):
+        r = r"([a-zA-Z0-9]*)=([^,)]*)"
+        kr_str = self.param.lower()
+        kwargs = dict([(i, eval(j)) for (i, j) in re.findall(r, self.param)])
+        if "rbf" in kr_str:
+            kernel = RBFKernel(**kwargs)
+        else:
+            raise NotImplementedError("{}".format(kr_str))
+        return kernel
+
+
+class GPRegression:
+    """
+    Gaussian process regression.
+    """
+    def __init__(self, kernel="RBFKernel", sigma=1e-10):
+        self.kernel = KernelInitializer(kernel)()
+        self.params = {"GP_mean": None, "GP_cov": None, "X": None}
+        self.hyperparams = {"kernel": str(self.kernel), "sigma": sigma}
+
+    def fit(self, X, y):
+        """
+        Build the GP prior from the observed samples.
+
+        Parameters:
+        X: input array of shape (n_samples, n_features)
+        y: target values for X, of shape (n_samples)
+        """
+        mu = np.zeros(X.shape[0])
+        Cov = self.kernel(X, X)
+        self.params["X"] = X
+        self.params["y"] = y
+        self.params["GP_cov"] = Cov
+        self.params["GP_mean"] = mu
+
+    def predict(self, X_star, conf_interval=0.95):
+        """
+        Predict the posterior mean, confidence half-width and covariance for new samples.
+
+        Parameters:
+        X_star: input array of shape (n_samples, n_features)
+        conf_interval: confidence level, float in (0, 1), default=0.95
+        """
+        X = self.params["X"]
+        y = self.params["y"]
+        K = self.params["GP_cov"]
+        sigma = self.hyperparams["sigma"]
+        K_star = self.kernel(X_star, X)
+        K_star_star = self.kernel(X_star, X_star)
+        sig = np.eye(K.shape[0]) * sigma  # diagonal term added to K before inversion
+        K_y_inv = np.linalg.pinv(K + sig)
+        mean = K_star @ K_y_inv @ y
+        cov = K_star_star - K_star @ K_y_inv @ K_star.T
+        percentile = norm.ppf(conf_interval)  # NOTE(review): one-sided quantile (~1.645 at 0.95); a two-sided interval would use ~1.96 - confirm intent
+        conf = percentile * np.sqrt(np.diag(cov))
+        return mean, conf, cov
+
+
+class BayesianOptimization:
+    """Maximise f with a GPRegression surrogate: draw random candidates, pick the best-scoring one, evaluate it, refit."""
+    def __init__(self):
+        self.model = GPRegression()
+        
+    def acquisition_function(self, Xsamples):
+        mu, _, cov = self.model.predict(Xsamples)
+        mu = mu if mu.ndim==1 else (mu.T)[0]
+        ysample = np.random.multivariate_normal(mu, cov)  # one joint draw from the predicted distribution at the candidates
+        return ysample
+    
+    def opt_acquisition(self, X, n_samples=20):
+        # Candidate search strategy: common choices are random search, grid search or local search
+        # Simple random search is used here; the candidate range could also be made configurable
+        Xsamples = np.random.randint(low=1,high=50,size=n_samples*X.shape[1])  # integer candidates in [1, 50)
+        Xsamples = Xsamples.reshape(n_samples, X.shape[1])
+        # Score the candidates with the acquisition function and keep the highest-scoring one
+        scores = self.acquisition_function(Xsamples)
+        ix = np.argmax(scores)
+        return Xsamples[ix, 0]  # NOTE(review): only the first feature is returned - looks like 1-D inputs are assumed
+    
+    def fit(self, f, X, y):
+        # Fit the GPR model
+        self.model.fit(X, y)
+        # Optimization loop
+        for i in range(15):
+            x_star = self.opt_acquisition(X)  # next sampling point
+            y_star = f(x_star)
+            mean, conf, cov = self.model.predict(np.array([[x_star]]))
+            # Append the new observation to the data set
+            X = np.vstack((X, [[x_star]]))
+            y = np.vstack((y, [[y_star]]))
+            # Refit the GPR model
+            self.model.fit(X, y)
+        ix = np.argmax(y)
+        print('Best Result: x=%.3f, y=%.3f' % (X[ix], y[ix]))
+        return X[ix], y[ix]    
+
diff --git a/code/chapter5.py b/code/chapter5.py
index c240311..01d597e 100644
--- a/code/chapter5.py
+++ b/code/chapter5.py
@@ -1,5 +1,6 @@
 import numpy as np
 import cvxopt
+import math
 
 
 ########-----NaiveBayes------#########
@@ -56,6 +57,11 @@ def _calculate_probabilities(self, X):
     def predict(self, X):
         y_pred = [self._calculate_probabilities(sample) for sample in X]
         return y_pred
+    
+    def score(self, X, y):
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
 
 
 ########-----LogisticRegression------#########
@@ -88,6 +94,11 @@ def predict(self, X):
         y_pred = self.sigmoid(X.dot(self.param))
         return y_pred
 
+    def score(self, X, y):
+        y_pred = (self.predict(X) >= 0.5).astype(int)  # predict() returns sigmoid probabilities; threshold them into 0/1 labels first
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)  # fraction of samples whose thresholded prediction equals the label
+        return accuracy
+    
 
 ########-----SupportVectorMachine------#########
 # 隐藏cvxopt输出
@@ -190,7 +201,12 @@ def predict(self, X):
             y_pred.append(np.sign(prediction))
         return np.array(y_pred)
     
+    def score(self, X, y):
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
 
+    
 ########-----KNN------#########
 class KNN():
     
@@ -235,7 +251,12 @@ def predict(self, X):
             y_pred.append(np.sign(prediction))
         return np.array(y_pred)
 
+    def score(self, X, y):
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
 
+    
 ########-----DecisionTree------#########
 class DecisionNode():
 
@@ -366,6 +387,11 @@ def predict(self, X):
         y_pred = [self.predict_value(sample) for sample in X]
         return y_pred
 
+    def score(self, X, y):
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
+    
     def print_tree(self, tree=None, indent=" "):
         """
         输出树
@@ -394,10 +420,31 @@ def calculate_entropy(y):
     return entropy
 
 
+def calculate_gini(y):
+    unique_labels = np.unique(y)
+    var = 0
+    for label in unique_labels:
+        count = len(y[y == label])
+        p = count / len(y)
+        var += p ** 2
+    return 1 - var
+
+
 class ClassificationTree(DecisionTree):
     """
-    分类树，在决策书节点选择计算信息增益，在叶子节点选择多数表决
+    分类树，在决策书节点选择计算信息增益/基尼指数，在叶子节点选择多数表决。
     """
+    def _calculate_gini_index(self, y, y1, y2):
+        """
+        计算基尼指数
+        """
+        p = len(y1) / len(y)
+        gini = calculate_gini(y)
+        gini_index = gini - p * \
+            calculate_gini(y1) - (1 - p) * \
+            calculate_gini(y2)
+        return gini_index
+    
     
     def _calculate_information_gain(self, y, y1, y2):
         """
@@ -408,7 +455,6 @@ def _calculate_information_gain(self, y, y1, y2):
         info_gain = entropy - p * \
             calculate_entropy(y1) - (1 - p) * \
             calculate_entropy(y2)
-
         return info_gain
 
     def _majority_vote(self, y):
@@ -425,40 +471,58 @@ def _majority_vote(self, y):
         return most_common
 
     def fit(self, X, y):
-        self._impurity_calculation = self._calculate_information_gain
+        self._impurity_calculation = self._calculate_gini_index
         self._leaf_value_calculation = self._majority_vote
         super(ClassificationTree, self).fit(X, y)
 
-        
-def calculate_variance(X):
-    mean = np.ones(np.shape(X)) * X.mean(0)
-    n_samples = np.shape(X)[0]
-    variance = (1 / n_samples) * np.diag((X - mean).T.dot(X - mean))
+
+def calculate_mse(y):
+    return np.mean((y - np.mean(y)) ** 2)
+
+
+def calculate_variance(y):
+    centered = np.reshape(y, (np.shape(y)[0], -1)) - np.mean(y, axis=0)  # subtract the per-column mean; a 1-D y is treated as a single column
+    variance = np.mean(centered ** 2, axis=0)  # per-column variance, equal to diag(centered.T @ centered) / n_samples
     return variance
 
 
 class RegressionTree(DecisionTree):
     """
-    回归树，在决策书节点选择计算方差降低，在叶子节点选择均值
+    回归树，在决策书节点选择计算MSE/方差降低，在叶子节点选择均值。
     """
+    def _calculate_mse(self, y, y1, y2):
+        """
+        计算MSE降低
+        """
+        mse_tot = calculate_mse(y)
+        mse_1 = calculate_mse(y1)
+        mse_2 = calculate_mse(y2)
+        frac_1 = len(y1) / len(y)
+        frac_2 = len(y2) / len(y)
+        mse_reduction = mse_tot - (frac_1 * mse_1 + frac_2 * mse_2)
+        return mse_reduction
     
     def _calculate_variance_reduction(self, y, y1, y2):
+        """
+        计算方差降低
+        """
         var_tot = calculate_variance(y)
         var_1 = calculate_variance(y1)
         var_2 = calculate_variance(y2)
         frac_1 = len(y1) / len(y)
         frac_2 = len(y2) / len(y)
-
         variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)
-
         return sum(variance_reduction)
 
     def _mean_of_y(self, y):
+        """
+        计算均值
+        """
         value = np.mean(y, axis=0)
         return value if len(value) > 1 else value[0]
 
     def fit(self, X, y):
-        self._impurity_calculation = self._calculate_variance_reduction
+        self._impurity_calculation = self._calculate_mse
         self._leaf_value_calculation = self._mean_of_y
         super(RegressionTree, self).fit(X, y)
 
diff --git a/code/chapter7.py b/code/chapter7.py
index 277de6a..3b6a909 100644
--- a/code/chapter7.py
+++ b/code/chapter7.py
@@ -1,7 +1,9 @@
 from abc import ABC, abstractmethod
 import numpy as np
+import math
 import re
-
+import progressbar
+from chapter5 import RegressionTree, DecisionTree, ClassificationTree
 
 #########---Regularizer---######
 class RegularizerBase(ABC):
@@ -273,3 +275,556 @@ def hyperparams(self):
         else:
             hp["wrappers"] = [hpw]
         return hp
+
+
+#####----Bagging----#######
+# 进度条
+bar_widgets = [
+    'Training: ', progressbar.Percentage(), ' ', progressbar.Bar(marker="-", left="[", right="]"),
+    ' ', progressbar.ETA()
+]
+
+def get_random_subsets(X, y, n_subsets, replacements=True):
+    """从训练数据中抽取数据子集 (默认可重复抽样)"""
+    n_samples = np.shape(X)[0]
+    # 将 X 和 y 拼接，并将元素随机排序
+    Xy = np.concatenate((X, y.reshape((1, len(y))).T), axis=1)
+    np.random.shuffle(Xy)
+    subsets = []
+    # 如果抽样时不重复抽样，可以只使用 50% 的训练数据；如果抽样时可重复抽样，使用全部的训练数据，默认可重复抽样
+    subsample_size = int(n_samples // 2)
+    if replacements:
+        subsample_size = n_samples      
+    for _ in range(n_subsets):
+        idx = np.random.choice(
+            range(n_samples),
+            size=np.shape(range(subsample_size)),
+            replace=replacements)
+        X = Xy[idx][:, :-1]
+        y = Xy[idx][:, -1]
+        subsets.append([X, y])
+    return subsets
+
+
+class Bagging():
+    """
+    Bagging分类器。使用一组分类树，这些分类树使用特征训练数据的随机子集。
+    """
+    def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
+                 min_gain=0, max_depth=float("inf")):
+        self.n_estimators = n_estimators    # 树的数目
+        self.min_samples_split = min_samples_split   # 分割所需的最小样本数
+        self.min_gain = min_gain            # 分割所需的最小纯度 (最小信息增益)
+        self.max_depth = max_depth          # 树的最大深度
+        self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)
+
+        # 初始化决策树
+        self.trees = []
+        for _ in range(n_estimators):
+            self.trees.append(
+                ClassificationTree(
+                    min_samples_split=self.min_samples_split,
+                    min_impurity=min_gain,
+                    max_depth=self.max_depth))
+
+    def fit(self, X, y):
+        # 对每棵树选择数据集的随机子集
+        subsets = get_random_subsets(X, y, self.n_estimators)
+        for i in self.progressbar(range(self.n_estimators)):
+            X_subset, y_subset = subsets[i]
+            # 用特征子集和真实值训练一棵子模型 (这里的数据也是训练数据集的随机子集)
+            self.trees[i].fit(X_subset, y_subset)
+
+    def predict(self, X):
+        y_preds = np.empty((X.shape[0], len(self.trees)))
+        # 每棵决策树都在数据上预测
+        for i, tree in enumerate(self.trees):
+            # 基于特征做出预测
+            prediction = tree.predict(X)
+            y_preds[:, i] = prediction
+            
+        y_pred = []
+        # 对每个样本，选择最常见的类别作为预测
+        for sample_predictions in y_preds:
+            y_pred.append(np.bincount(sample_predictions.astype('int')).argmax())
+        return y_pred
+    
+    def score(self, X, y):
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
+
+    
+#####----RandomForest----#######
+class RandomForest():
+    """
+    随机森林分类器。使用一组分类树，这些分类树使用特征的随机子集训练数据的随机子集。
+    """
+    def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
+                 min_gain=0, max_depth=float("inf")):
+        self.n_estimators = n_estimators    # 树的数目
+        self.max_features = max_features    # 每棵树的最大使用特征数
+        self.min_samples_split = min_samples_split   # 分割所需的最小样本数
+        self.min_gain = min_gain            # 分割所需的最小纯度 (最小信息增益)
+        self.max_depth = max_depth          # 树的最大深度
+        self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)
+
+        # 初始化决策树
+        self.trees = []
+        for _ in range(n_estimators):
+            self.trees.append(
+                ClassificationTree(
+                    min_samples_split=self.min_samples_split,
+                    min_impurity=min_gain,
+                    max_depth=self.max_depth))
+
+    def fit(self, X, y):
+        n_features = np.shape(X)[1]
+        # 如果 max_features 没有定义，取默认值 sqrt(n_features)
+        if not self.max_features:
+            self.max_features = int(math.sqrt(n_features))
+
+        # 对每棵树选择数据集的随机子集
+        subsets = get_random_subsets(X, y, self.n_estimators)
+
+        for i in self.progressbar(range(self.n_estimators)):
+            X_subset, y_subset = subsets[i]
+            # 选择特征的随机子集
+            idx = np.random.choice(range(n_features), size=self.max_features, replace=True)
+            # 保存特征的索引用于预测
+            self.trees[i].feature_indices = idx
+            # 选择索引对应的特征
+            X_subset = X_subset[:, idx]
+            # 用特征子集和真实值训练一棵子模型 (这里的数据也是训练数据集的随机子集)
+            self.trees[i].fit(X_subset, y_subset)
+
+    def predict(self, X):
+        y_preds = np.empty((X.shape[0], len(self.trees)))
+        # 每棵决策树都在数据上预测
+        for i, tree in enumerate(self.trees):
+            # 使用该决策树训练使用的特征
+            idx = tree.feature_indices
+            # 基于特征做出预测
+            prediction = tree.predict(X[:, idx])
+            y_preds[:, i] = prediction
+            
+        y_pred = []
+        # 对每个样本，选择最常见的类别作为预测
+        for sample_predictions in y_preds:
+            y_pred.append(np.bincount(sample_predictions.astype('int')).argmax())
+        return y_pred
+    
+    def score(self, X, y):
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
+
+    
+#####----Adaboost----#######
+# 决策树桩，作为 Adaboost 算法的弱分类器 (基分类器)
+class DecisionStump():
+    
+    def __init__(self):
+        self.polarity = 1            # 表示决策树桩默认输出的类别为 1 或是 -1
+        self.feature_index = None    # 用于分类的特征索引
+        self.threshold = None        # 特征的阈值
+        self.alpha = None            # 表示分类器准确性的值
+
+class Adaboost():
+    """
+    Adaboost 算法。
+    """
+    def __init__(self, n_estimators=5):
+        self.n_estimators = n_estimators    # 将使用的弱分类器的数量
+        self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)
+
+    def fit(self, X, y):
+        n_samples, n_features = np.shape(X)
+        # 初始化权重 (上文中的 D)，均为 1/N
+        w = np.full(n_samples, (1 / n_samples))
+        self.trees = []
+        # 迭代过程
+        for _ in self.progressbar(range(self.n_estimators)):
+            tree = DecisionStump()
+            min_error = float('inf')    # 使用某一特征值的阈值预测样本的最小误差
+            # 迭代遍历每个 (不重复的) 特征值，查找预测 y 的最佳阈值
+            for feature_i in range(n_features):
+                feature_values = np.expand_dims(X[:, feature_i], axis=1)
+                unique_values = np.unique(feature_values)
+                # 将该特征的每个特征值作为阈值
+                for threshold in unique_values:
+                    p = 1
+                    # 将所有样本预测默认值可以设置为 1
+                    prediction = np.ones(np.shape(y))
+                    # 低于特征值阈值的预测改为 -1
+                    prediction[X[:, feature_i] < threshold] = -1
+                    # 计算错误率
+                    error = sum(w[y != prediction])
+                    # 如果错误率超过 50%，我们反转决策树桩默认输出的类别
+                    # 比如 error = 0.8 => (1 - error) = 0.2，
+                    # 原来计算的是输出到类别 1 的概率，类别 1 作为默认类别。反转后类别 0 作为默认类别
+                    if error > 0.5:
+                        error = 1 - error
+                        p = -1
+                    # 如果这个阈值导致最小的错误率，则保存
+                    if error < min_error:
+                        tree.polarity = p
+                        tree.threshold = threshold
+                        tree.feature_index = feature_i
+                        min_error = error
+                        
+            # 计算用于更新样本权值的 alpha 值，也是作为基分类器的系数。
+            tree.alpha = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))
+            # 将所有样本预测默认值设置为 1
+            predictions = np.ones(np.shape(y))
+            # 如果特征值低于阈值，则修改预测结果，这里还需要考虑弱分类器的默认输出类别
+            negative_idx = (tree.polarity * X[:, tree.feature_index] < tree.polarity * tree.threshold)
+            predictions[negative_idx] = -1
+            # 计算新权值，未正确分类样本的权值增大，正确分类样本的权值减小
+            w *= np.exp(-tree.alpha * y * predictions)
+            w /= np.sum(w)
+            # 保存分类器
+            self.trees.append(tree)
+
+    def predict(self, X):
+        n_samples = np.shape(X)[0]
+        y_pred = np.zeros((n_samples, 1))
+        # 用每一个基分类器预测样本
+        for tree in self.trees:
+            # 将所有样本预测默认值设置为 1
+            predictions = np.ones(np.shape(y_pred))
+            negative_idx = (tree.polarity * X[:, tree.feature_index] < tree.polarity * tree.threshold)
+            predictions[negative_idx] = -1
+            # 对基分类器加权求和，权重 alpha
+            y_pred += tree.alpha * predictions
+        # 返回预测结果 1 或 -1
+        y_pred = np.sign(y_pred).flatten()
+        return y_pred
+    
+    def score(self, X, y):
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
+
+    
+#####----GBDT----#######
+class Loss(ABC):
+    """Abstract loss interface: concrete losses must implement `loss` and `grad`."""
+    def __init__(self):
+        super().__init__()
+
+    @abstractmethod
+    def loss(self, y_true, y_pred):
+        raise NotImplementedError()  # raise (not return) so a super().loss() call fails loudly, matching grad
+
+    @abstractmethod
+    def grad(self, y, y_pred):
+        raise NotImplementedError()
+
+class SquareLoss(Loss):
+    
+    def __init__(self): 
+        pass
+
+    def loss(self, y, y_pred):
+        pass
+
+    def grad(self, y, y_pred):
+        return -(y - y_pred)
+    
+    def hess(self, y, y_pred):
+        return 1
+
+class CrossEntropyLoss(Loss):
+    
+    def __init__(self): 
+        pass
+
+    def loss(self, y, y_pred):
+        pass
+
+    def grad(self, y, y_pred):
+        return - (y - y_pred)  
+    
+    def hess(self, y, y_pred):
+        return y_pred * (1-y_pred)
+
+
+def softmax(x):
+    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
+    return e_x / e_x.sum(axis=-1, keepdims=True)
+
+
+def line_search(y, y_pred, h_pred):
+    # Step size Lp / Lpp for the new learner's predictions h_pred (1 when h_pred is all zeros); module-level function, so it takes no `self`.
+    Lp, Lpp = 2 * np.sum((y - y_pred) * h_pred), np.sum(h_pred * h_pred)
+    return 1 if Lpp == 0 else Lp / Lpp
+
+
+def to_categorical(x, n_classes=None):
+    """One-hot encode a 1-D array of class indices into an array of shape (len(x), n_classes).
+    Integer-valued float labels are accepted and cast to int; n_classes defaults to max(x) + 1."""
+    x = np.asarray(x).astype(int)  # float labels would break both np.zeros(shape) and the index assignment below
+    if not n_classes:
+        n_classes = np.amax(x) + 1
+    one_hot = np.zeros((x.shape[0], n_classes))
+    one_hot[np.arange(x.shape[0]), x] = 1
+    return one_hot
+
+
+class GradientBoostingDecisionTree(object):
+    """
+    Gradient boosted decision trees (GBDT).
+
+    An additive model in which every new base learner (a regression tree) is
+    fitted to the negative gradient of the loss at the current prediction.
+    Regression uses ``SquareLoss``; classification one-hot encodes the labels,
+    uses ``CrossEntropyLoss`` and trains one tree per class in every round.
+    """
+    def __init__(self, n_estimators, learning_rate=1, min_samples_split=2,
+                 min_impurity=1e-7, max_depth=float("inf"), is_regression=False, line_search=False):
+        self.n_estimators = n_estimators         # number of boosting rounds
+        self.learning_rate = learning_rate       # step size along the negative gradient
+        self.min_samples_split = min_samples_split    # minimum samples needed to split a node
+        self.min_impurity = min_impurity         # minimum impurity needed to split a node
+        self.max_depth = max_depth               # maximum depth of each tree
+        self.is_regression = is_regression       # regression (True) or classification (False)
+        self.line_search = line_search           # whether to rescale each tree by a line search
+        self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)
+        # Square loss for regression, cross-entropy for classification.
+        self.loss = SquareLoss()
+        if not self.is_regression:
+            self.loss = CrossEntropyLoss()
+
+    def fit(self, X, Y):
+        """Fit ``n_estimators`` rounds of trees on samples ``X`` and targets ``Y``."""
+        # Classification: turn the labels into a one-hot matrix.
+        if not self.is_regression:
+            Y = to_categorical(Y.flatten())
+        else:
+            Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y
+        self.out_dims = Y.shape[1]
+        self.trees = np.empty((self.n_estimators, self.out_dims), dtype=object)
+        # Boosting starts from the per-column target mean. It is stored because
+        # predict() has to start from the same constant.
+        self.init_pred = np.mean(Y, axis=0)
+        Y_pred = np.full(np.shape(Y), self.init_pred)
+        # The first round is applied with weight 1, later rounds with the learning rate.
+        self.weights = np.ones((self.n_estimators, self.out_dims))
+        self.weights[1:, :] *= self.learning_rate
+        # Boosting rounds.
+        for i in self.progressbar(range(self.n_estimators)):
+            for c in range(self.out_dims):
+                tree = RegressionTree(
+                        min_samples_split=self.min_samples_split,
+                        min_impurity=self.min_impurity,
+                        max_depth=self.max_depth)
+                # Evaluate the loss gradient at the current model and fit the tree to it.
+                if not self.is_regression:
+                    Y_hat = softmax(Y_pred)
+                    y, y_pred = Y[:, c], Y_hat[:, c]
+                else:
+                    y, y_pred = Y[:, c], Y_pred[:, c]
+                neg_grad = -1 * self.loss.grad(y, y_pred)
+                tree.fit(X, neg_grad)
+                # Predictions of the new base learner on the training set.
+                h_pred = tree.predict(X)
+                # Optional line search for the weight of this tree.
+                if self.line_search:
+                    # The module-level helper declares a leading placeholder
+                    # parameter, hence the explicit None.
+                    self.weights[i, c] *= line_search(None, y, y_pred, h_pred)
+                # Add the weighted tree to the additive model.
+                Y_pred[:, c] += np.multiply(self.weights[i, c], h_pred)
+                self.trees[i, c] = tree
+
+    def predict(self, X):
+        """Predict for samples ``X``.
+
+        Returns:
+            Classification: the index of the highest-scoring class per sample.
+            Regression: an array of shape ``(n_samples, out_dims)``.
+        """
+        # Start from the same constant that fit() started from, then add every
+        # weighted tree; without the constant the output is shifted by the
+        # target mean.
+        Y_pred = np.full((X.shape[0], self.out_dims), self.init_pred, dtype=float)
+        for c in range(self.out_dims):
+            for i in range(self.n_estimators):
+                Y_pred[:, c] += np.multiply(self.weights[i, c], self.trees[i, c].predict(X))
+        if not self.is_regression:
+            # Classification: return the most likely class.
+            Y_pred = Y_pred.argmax(axis=1)
+        return Y_pred
+
+    def score(self, X, y):
+        """Return the fraction of entries of ``y`` equal to ``self.predict(X)``."""
+        y_pred = self.predict(X)
+        accuracy = np.sum(y == y_pred, axis=0) / len(y)
+        return accuracy
+
+
+class GradientBoostingRegressor(GradientBoostingDecisionTree):
+    """GBDT preconfigured for regression (``is_regression`` defaults to True)."""
+
+    def __init__(self, n_estimators=200, learning_rate=1, min_samples_split=2,
+                 min_impurity=1e-7, max_depth=float("inf"), is_regression=True, line_search=False):
+        # Every argument is forwarded unchanged to the base learner.
+        super().__init__(
+            n_estimators=n_estimators,
+            learning_rate=learning_rate,
+            min_samples_split=min_samples_split,
+            min_impurity=min_impurity,
+            max_depth=max_depth,
+            is_regression=is_regression,
+            line_search=line_search,
+        )
+
+
+class GradientBoostingClassifier(GradientBoostingDecisionTree):
+    """GBDT preconfigured for classification (``is_regression`` defaults to False)."""
+
+    def __init__(self, n_estimators=200, learning_rate=1, min_samples_split=2,
+                 min_impurity=1e-7, max_depth=float("inf"), is_regression=False, line_search=False):
+        # Every argument is forwarded unchanged to the base learner.
+        super().__init__(
+            n_estimators=n_estimators,
+            learning_rate=learning_rate,
+            min_samples_split=min_samples_split,
+            min_impurity=min_impurity,
+            max_depth=max_depth,
+            is_regression=is_regression,
+            line_search=line_search,
+        )
+
+        
+#####----XGBoost----#######
+class XGBoostRegressionTree(DecisionTree):
+    """
+    Regression tree used as the XGBoost base learner.
+
+    Built on the decision tree from chapter 5, so split points are found
+    greedily by enumerating the candidate splits of every feature. The target
+    handed to ``fit`` packs ``y_true`` in its left half of columns and the
+    current ``y_pred`` in its right half.
+    """
+    def __init__(self, min_samples_split=2, min_impurity=1e-7,
+                 max_depth=float("inf"), loss=None, gamma=0., lambd=0.):
+        super(XGBoostRegressionTree, self).__init__(min_impurity=min_impurity,
+            min_samples_split=min_samples_split,
+            max_depth=max_depth)
+        self.gamma = gamma   # penalty on the number of leaves
+        self.lambd = lambd   # penalty on the leaf weights
+        self.loss = loss     # loss object providing grad() and hess()
+
+    def _split(self, y):
+        """Split the packed target into ``(y_true, y_pred)`` column halves."""
+        col = np.shape(y)[1] // 2
+        y, y_pred = y[:, :col], y[:, col:]
+        return y, y_pred
+
+    def _hess_sum(self, y, y_pred):
+        """Sum the Hessian over all samples.
+
+        The Hessian is broadcast to the shape of ``y_pred`` first, so a loss
+        that returns a scalar second derivative is still counted once per sample.
+        """
+        hess = np.broadcast_to(self.loss.hess(y, y_pred), np.shape(y_pred))
+        return hess.sum()
+
+    def _gain(self, y, y_pred):
+        """Return the structure score ``G^2 / (H + lambd)`` of one node.
+
+        ``G`` and ``H`` are the sums of the first and second derivatives of the
+        loss over the node's samples -- the same quantities that
+        ``_approximate_update`` uses for the leaf weight.
+        NOTE: with ``lambd == 0`` and ``H == 0`` this divides by zero.
+        """
+        grad_sum = np.sum(self.loss.grad(y, y_pred))
+        hess_sum = self._hess_sum(y, y_pred)
+        return np.power(grad_sum, 2) / (hess_sum + self.lambd)
+
+    def _gain_by_taylor(self, y, y1, y2):
+        """Return the gain of splitting ``y`` into the branches ``y1`` and ``y2``."""
+        # Unpack the parent and both children.
+        y, y_pred = self._split(y)
+        y1, y1_pred = self._split(y1)
+        y2, y2_pred = self._split(y2)
+        true_gain = self._gain(y1, y1_pred)
+        false_gain = self._gain(y2, y2_pred)
+        gain = self._gain(y, y_pred)
+        # Improvement of the children over the parent, minus the per-leaf penalty.
+        return 0.5 * (true_gain + false_gain - gain) - self.gamma
+
+    def _approximate_update(self, y):
+        """Return the leaf weight ``-G / (H + lambd)``."""
+        y, y_pred = self._split(y)
+        gradient = np.sum(self.loss.grad(y, y_pred))
+        hessian = self._hess_sum(y, y_pred)
+        leaf_approximation = -gradient / (hessian + self.lambd)
+        return leaf_approximation
+
+    def fit(self, X, y):
+        """Fit the tree; ``y`` is the packed ``[y_true | y_pred]`` matrix."""
+        # Install the XGBoost split criterion and leaf value before delegating
+        # to the base class (presumably it reads these two hooks -- confirm
+        # against DecisionTree).
+        self._impurity_calculation = self._gain_by_taylor
+        self._leaf_value_calculation = self._approximate_update
+        super(XGBoostRegressionTree, self).fit(X, y)
+
+
+class XGBoost(object):
+    """
+    XGBoost learner: an additive model of ``XGBoostRegressionTree`` base learners.
+    """
+    def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,
+                 min_impurity=1e-7, max_depth=2, is_regression=False, gamma=0., lambd=0.):
+        # Boosting configuration.
+        self.n_estimators = n_estimators            # number of trees per output column
+        self.learning_rate = learning_rate          # step size along the negative gradient
+        # Tree growth limits.
+        self.min_samples_split = min_samples_split  # minimum samples needed to split a node
+        self.min_impurity = min_impurity            # minimum impurity needed to split a node
+        self.max_depth = max_depth                  # maximum depth of each tree
+        # Regularisation.
+        self.gamma = gamma                          # penalty on the number of leaves
+        self.lambd = lambd                          # penalty on the leaf weights
+        self.is_regression = is_regression          # regression (True) or classification (False)
+        self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)
+        # Square loss for regression, cross-entropy for classification.
+        self.loss = SquareLoss() if self.is_regression else CrossEntropyLoss()
+
+    def fit(self, X, Y):
+        """Fit ``n_estimators`` rounds of trees on samples ``X`` and targets ``Y``."""
+        if self.is_regression:
+            # Regression targets are used as a column matrix.
+            Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y
+        else:
+            # Classification labels become a one-hot matrix.
+            Y = to_categorical(Y.flatten())
+        self.out_dims = Y.shape[1]
+        self.trees = np.empty((self.n_estimators, self.out_dims), dtype=object)
+        Y_pred = np.zeros(np.shape(Y))
+        # The first round is applied with weight 1, later rounds with the learning rate.
+        self.weights = np.ones((self.n_estimators, self.out_dims))
+        self.weights[1:, :] *= self.learning_rate
+        for round_idx in self.progressbar(range(self.n_estimators)):
+            for col in range(self.out_dims):
+                tree = XGBoostRegressionTree(
+                        min_samples_split=self.min_samples_split,
+                        min_impurity=self.min_impurity,
+                        max_depth=self.max_depth,
+                        loss=self.loss,
+                        gamma=self.gamma,
+                        lambd=self.lambd)
+                # Classification compares against softmax probabilities,
+                # regression against the raw running prediction.
+                current = Y_pred if self.is_regression else softmax(Y_pred)
+                target_col = Y[:, col].reshape(-1, 1)
+                current_col = current[:, col].reshape(-1, 1)
+                # The tree receives y_true and y_pred packed side by side.
+                tree.fit(X, np.concatenate((target_col, current_col), axis=1))
+                # Add the weighted prediction of the new tree to the model.
+                h_pred = tree.predict(X)
+                Y_pred[:, col] += np.multiply(self.weights[round_idx, col], h_pred)
+                self.trees[round_idx, col] = tree
+
+    def predict(self, X):
+        """Predict for samples ``X``.
+
+        Returns:
+            Classification: the index of the highest-scoring class per sample.
+            Regression: an array of shape ``(n_samples, out_dims)``.
+        """
+        Y_pred = np.zeros((X.shape[0], self.out_dims))
+        for col in range(self.out_dims):
+            running = np.array([])
+            for round_idx in range(self.n_estimators):
+                step = np.multiply(self.weights[round_idx, col],
+                                   self.trees[round_idx, col].predict(X))
+                # While the running total is empty or all zeros, the step replaces it.
+                if running.any():
+                    running = running + step
+                else:
+                    running = step
+            Y_pred[:, col] = running
+        if self.is_regression:
+            return Y_pred
+        # Classification: return the most likely class.
+        return Y_pred.argmax(axis=1)
+
+    def score(self, X, y):
+        """Return the fraction of entries of ``y`` equal to ``self.predict(X)``."""
+        n_matches = np.sum(y == self.predict(X), axis=0)
+        return n_matches / len(y)
+    
+    
+class XGBRegressor(XGBoost):
+    """XGBoost preconfigured for regression (``is_regression`` defaults to True)."""
+
+    def __init__(self, n_estimators=200, learning_rate=1, min_samples_split=2,
+                 min_impurity=1e-7, max_depth=float("inf"), is_regression=True,
+                 gamma=0., lambd=0.):
+        # Every argument is forwarded unchanged to the base learner.
+        super().__init__(
+            n_estimators=n_estimators,
+            learning_rate=learning_rate,
+            min_samples_split=min_samples_split,
+            min_impurity=min_impurity,
+            max_depth=max_depth,
+            is_regression=is_regression,
+            gamma=gamma,
+            lambd=lambd,
+        )
+
+
+class XGBClassifier(XGBoost):
+    """XGBoost preconfigured for classification (``is_regression`` defaults to False)."""
+
+    def __init__(self, n_estimators=200, learning_rate=1, min_samples_split=2,
+                 min_impurity=1e-7, max_depth=float("inf"), is_regression=False,
+                 gamma=0., lambd=0.):
+        # Every argument is forwarded unchanged to the base learner.
+        super().__init__(
+            n_estimators=n_estimators,
+            learning_rate=learning_rate,
+            min_samples_split=min_samples_split,
+            min_impurity=min_impurity,
+            max_depth=max_depth,
+            is_regression=is_regression,
+            gamma=gamma,
+            lambd=lambd,
+        )
diff --git a/contents.txt b/contents.txt
new file mode 100644
index 0000000..77ec27d
--- /dev/null
+++ b/contents.txt
@@ -0,0 +1,205 @@
+注：本目录参照《深度学习》一书的目录编排。就本项目的内容而言，目录其实可以分得更细致，这里只列到第三级为止。
+
+**目录**:
+
+- 第二章 线性代数
+  - 1 标量, 向量, 矩阵, 张量
+  - 2 矩阵转置
+  - 3 矩阵加法
+  - 4 矩阵乘法
+  - 5 单位矩阵
+  - 6 矩阵的逆
+  - 7 范数
+  - 8 特征值分解
+  - 9 奇异值分解
+  - 10 PCA (主成分分析)
+
+
+- 第三章 概率与信息论
+  - 1 概率
+    - 1.1 概率与随机变量
+    - 1.2 概率分布
+      - 1.2.1 概率质量函数
+      - 1.2.2 概率密度函数
+      - 1.2.3 累积分布函数
+    - 1.3 条件概率与条件独立
+    - 1.4 随机变量的度量
+    - 1.5 常用概率分布
+      - 1.5.1 伯努利分布 (两点分布)
+      - 1.5.2 范畴分布 (分类分布)
+      - 1.5.3 高斯分布 (正态分布)
+      - 1.5.4 多元高斯分布 (多元正态分布)
+      - 1.5.5 指数分布
+      - 1.5.6 拉普拉斯分布
+      - 1.5.7 Dirac 分布
+    - 1.6 常用函数的有用性质
+      - 1.6.1 logistic sigmoid 函数
+      - 1.6.2 softplus 函数
+  - 2 信息论
+  - 3 图模型
+    - 3.1 有向图模型
+      - 3.1.1 贝叶斯网的独立性
+    - 3.2 无向图模型
+      - 3.2.1 马尔可夫网的条件独立性
+
+
+- 第四章 数值计算
+  - 1 上溢和下溢
+  - 2 优化方法
+    - 2.1 梯度下降法
+    - 2.2 牛顿法
+    - 2.3 约束优化
+
+
+- 第五章 机器学习基础
+  - 1 学习算法
+    - 1.1 举例:线性回归 
+  - 2 容量、过拟合、欠拟合
+    - 2.1 泛化问题
+    - 2.2 容量
+  - 3 超参数与验证集
+  - 4 偏差和方差
+    - 4.1 偏差
+    - 4.2 方差
+    - 4.3 误差与偏差和方差的关系
+  - 5 最大似然估计
+  - 6 贝叶斯统计
+  - 7 最大后验估计
+    - 7.1 举例:线性回归
+  - 8 监督学习方法
+    - 8.1 概率监督学习
+    - 8.2 支持向量机
+      - 8.2.1 核技巧
+    - 8.3 k-近邻
+    - 8.4 决策树
+      - 8.4.1 特征选择
+      - 8.4.2 决策树生成
+      - 8.4.3 决策树正则化
+  - 9 无监督学习方法
+    - 9.1 主成分分析法
+    - 9.2 k-均值聚类
+
+
+- 第六章 深度前馈网络
+  - 1 深度前馈网络
+  - 2 DFN 相关设计
+    - 2.1 隐藏单元
+    - 2.2 输出单元
+    - 2.3 代价函数
+    - 2.4 架构设计
+  - 3 反向传播算法
+    - 3.1 单个神经元的训练
+    - 3.2 多层神经网络的训练
+      - 3.2.1 定义权重初始化方法
+      - 3.2.2 定义激活函数
+      - 3.2.3 定义优化方法
+      - 3.2.4 定义网络层的框架
+      - 3.2.5 定义代价函数
+      - 3.2.6 定义深度前馈网络
+  - 4 神经网络的万能近似定理
+  - 5 实例:学习 XOR
+
+
+- 第七章 深度学习中的正则化
+  - 1 参数范数惩罚
+    - 1.1 L2 正则化
+    - 1.2 L1 正则化
+    - 1.3 总结 (L2 正则化与L1 正则化的解)
+    - 1.4 作为约束的范数惩罚
+    - 1.5 欠约束问题
+  - 2 数据增强
+    - 2.1 数据集增强
+    - 2.2 噪声鲁棒性
+  - 3 训练方案
+    - 3.1 半监督学习
+    - 3.2 多任务学习
+    - 3.3 提前终止
+  - 4 模型表示
+    - 4.1 参数绑定与共享
+    - 4.2 稀疏表示
+    - 4.3 Bagging 及其他集成方法
+      - 4.3.1 Bagging 方法
+      - 4.3.2 随机森林
+      - 4.3.3 方法解决过拟合
+    - 4.4 Dropout
+  - 5 样本测试
+  - 6 补充材料
+    - 6.1 Boosting
+      - 6.1.1 前向分步加法模型
+      - 6.1.2 AdaBoost 算法
+      - 6.1.3 Boosting Tree 算法与 GBDT 算法
+      - 6.1.4 XGBoost 算法
+
+
+- 第八章 深度模型中的优化
+  - 1 基本优化算法
+    - 1.1 梯度
+      - 1.1.1 梯度下降
+      - 1.1.2 随机梯度下降
+    - 1.2 动量
+      - 1.2.1 Momentum 算法
+      - 1.2.2 NAG 算法
+    - 1.3 自适应学习率
+      - 1.3.1 AdaGrad 算法
+      - 1.3.2 RMSProp 算法
+      - 1.3.3 AdaDelta 算法
+      - 1.3.4 Adam 算法
+    - 1.4 二阶近似方法
+      - 1.4.1 牛顿法
+      - 1.4.2 拟牛顿法
+  - 2 优化策略
+    - 2.1 参数初始化
+  - 3 批标准化
+  - 4 坐标下降
+  - 5 Polyak 平均
+  - 6 监督预训练
+  - 7 设计有助于优化的模型
+
+
+- 第九章 卷积网络
+  - 1 卷积运算
+  - 2 池化
+  - 3 深度学习框架下的卷积
+    - 3.1 多个并行卷积
+    - 3.2 输入值与核
+    - 3.3 填充 (Padding)
+    - 3.4 卷积步幅 (Stride)
+  - 4 更多的卷积策略
+    - 4.1 深度可分离卷积 (Depthwise Separable Convolution)
+    - 4.2 分组卷积 (Group Convolution)
+    - 4.3 扩张卷积 (Dilated Convolution)
+  - 5 GEMM 转换
+  - 6 卷积网络的训练
+    - 6.1 卷积网络示意图
+    - 6.2 单层卷积层/池化层
+      - 6.2.1 卷积函数的导数及反向传播
+      - 6.2.2 池化函数的导数及反向传播
+    - 6.3 多层卷积层/池化层
+    - 6.4 Flatten 层 & 全连接层
+  - 7 平移等变
+  - 8 代表性的卷积神经网络
+    - 8.1 卷积神经网络 (LeNet)
+
+
+- 第十一章 实践方法论
+  - 1 实践方法论
+  - 2 性能度量指标
+    - 2.1 错误率与准确性
+    - 2.2 查准率、查全率与 F1 值
+      - 2.2.1 混淆矩阵
+      - 2.2.2 查准率和查全率的定义与关联
+      - 2.2.3 F1 值
+    - 2.3 PR 曲线
+    - 2.4 ROC 曲线与 AUC 值
+      - 2.4.1 ROC 曲线
+      - 2.4.2 AUC 值的计算方法
+    - 2.5 覆盖
+    - 2.6 指标性能的瓶颈
+  - 3 默认基准模型
+  - 4 确定是否收集更多数据
+  - 5 选择超参数
+    - 5.1 手动超参数调整
+    - 5.2 自动超参数优化算法
+      - 5.2.1 网格搜索 (Grid Search)
+      - 5.2.2 随机搜索 (Random Search)
+      - 5.2.3 基于模型的超参数优化 (Model-based Hyperparameter Optimization)
diff --git a/reference.txt b/reference.txt
index 535107c..9381c60 100644
--- a/reference.txt
+++ b/reference.txt
@@ -34,12 +34,20 @@
   - https://www.zybuluo.com/songying/note/1400484
   - https://zhuanlan.zhihu.com/p/37120298
   - https://kevinzakka.github.io/2016/09/14/batch_normalization/
+  - http://gitlinux.net/2018-10-29-xgboost/
+  - https://medium.com/swlh/boosting-and-bagging-explained-with-examples-5353a36eb78d
+  - http://www.ccs.neu.edu/home/vip/teach/MLcourse/4_boosting/slides/gradient_boosting.pdf
+  - https://blog.csdn.net/liangjun_feng/article/details/79603705
+  - https://blog.csdn.net/sinat_22594309/article/details/60957594
+  - https://www.zybuluo.com/yxd/note/611571
+  - http://freemind.pluskid.org/machine-learning/sparsity-and-some-basics-of-l1-regularization/#ed61992b37932e208ae114be75e42a3e6dc34cb3
 
 - 深度模型中的优化
   - https://zhuanlan.zhihu.com/p/32626442
   - https://github.com/exacity/deeplearningbook-chinese
   - http://cthorey.github.io./backpropagation/
   - http://www.ludoart.cn/2019/02/22/Optimization-Methods/
+  - https://blog.csdn.net/itplus/article/details/21897715
   
 - 卷积神经网络
   - https://www.slideshare.net/kuwajima/cnnbp
@@ -50,4 +58,14 @@
   - https://zh.gluon.ai/chapter_convolutional-neural-networks/lenet.html
   - https://zhuanlan.zhihu.com/p/32702031
   - https://blog.csdn.net/marsjhao/article/details/73088850
+
+- 实践方法论
+  - https://github.com/masakazu-ishihata/BayesianOptimization
+  - https://github.com/bjzhao143/MLwithPython
+  - https://medium.com/inveterate-learner/deep-learning-book-chapter-11-c6ad1d3c3c08
+  - https://www.alexejgossmann.com/auc/
+  - https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-imbalanced-classification/
+  - https://www.yuque.com/books/share/f4031f65-70c1-4909-ba01-c47c31398466/kqbfug
+  - http://bridg.land/posts/gaussian-processes-1
+  - https://zhuanlan.zhihu.com/p/76269142
  
diff --git a/update.txt b/update.txt
new file mode 100644
index 0000000..8e28d9f
--- /dev/null
+++ b/update.txt
@@ -0,0 +1,10 @@
+**更新记录**:
+
+2020/3/：
+
+ 	1. 修改第五章决策树部分，补充 ID3 和 CART 的原理，代码实现以 CART 为主。
+ 	2. 第七章添加 L1 和 L2 正则化最优解的推导 (即 L1稀疏解的原理)。
+ 	3. 第七章添加集成学习方法的推导与代码实现，包括 Bagging (随机森林)、Boosting (Adaboost、GBDT、XGBoost)。
+ 	4. 第八章添加牛顿法与拟牛顿法 (DFP、BFGS、L-BFGS) 的推导。
+ 	5. 第十一章添加高斯过程回归 (GPR) 与贝叶斯优化的推导与代码实现。
+
diff --git "a/\346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200.pdf" "b/\346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200.pdf"
deleted file mode 100644
index 99b59c2..0000000
Binary files "a/\346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200.pdf" and /dev/null differ
diff --git "a/\346\267\261\345\272\246\345\255\246\344\271\240\344\270\255\347\232\204\346\255\243\345\210\231\345\214\226.pdf" "b/\346\267\261\345\272\246\345\255\246\344\271\240\344\270\255\347\232\204\346\255\243\345\210\231\345\214\226.pdf"
deleted file mode 100644
index 4ffc7a8..0000000
Binary files "a/\346\267\261\345\272\246\345\255\246\344\271\240\344\270\255\347\232\204\346\255\243\345\210\231\345\214\226.pdf" and /dev/null differ
diff --git "a/\346\267\261\345\272\246\346\250\241\345\236\213\344\270\255\347\232\204\344\274\230\345\214\226.pdf" "b/\346\267\261\345\272\246\346\250\241\345\236\213\344\270\255\347\232\204\344\274\230\345\214\226.pdf"
deleted file mode 100644
index 5b2c75a..0000000
Binary files "a/\346\267\261\345\272\246\346\250\241\345\236\213\344\270\255\347\232\204\344\274\230\345\214\226.pdf" and /dev/null differ