
Commit

update
imhuay committed Sep 3, 2018
1 parent 07bed98 commit 3555729
Showing 50 changed files with 1,361 additions and 190 deletions.
File renamed without changes.
72 changes: 70 additions & 2 deletions ToDo.md
@@ -42,7 +42,75 @@ Updates Log
- Machine Learning in Action - selected code (K-Means, etc.)
- Determining whether two binary trees are similar, plus optimizations
- Expected value of `max(x, y)`

- Given n numbers in the range [0, n), count the occurrences of each number (without allocating extra space)
- Methods for speeding up network convergence
- Regression trees, Gini index
- Long/short URL conversion
- Trapping rain water (histogram) problem
```
#include <vector>
#include <algorithm>
using namespace std;

// O(n^2): for each position, scan for the max height to its left and to its right
int foo(vector<int> ns) {
    int n = ns.size();
    if (n < 3) return 0;
    // dp[i][0]: max height in ns[0..i], dp[i][1]: max height in ns[i..n-1]
    vector<vector<int> > dp(n, vector<int>(2, 0));
    for (int i = 1; i < n - 1; i++) {
        int lo = ns[i], hi = ns[i];
        for (int j = 0; j < i; j++)
            lo = max(lo, ns[j]);
        dp[i][0] = lo;
        for (int k = i + 1; k < n; k++)
            hi = max(hi, ns[k]);
        dp[i][1] = hi;
    }
    int ret = 0;
    for (int i = 1; i < n - 1; i++) {
        int mx = min(dp[i][0], dp[i][1]);
        if (mx > ns[i])
            ret += mx - ns[i];
    }
    return ret;
}

// O(n): precompute prefix and suffix maxima, then sum the trapped water
int bar(vector<int> ns) {
    int n = ns.size();
    if (n < 3) return 0;
    vector<int> dp_fw(ns);  // dp_fw[i]: max(ns[0..i])
    vector<int> dp_bw(ns);  // dp_bw[i]: max(ns[i..n-1])
    for (int i = 1; i < n; i++)
        dp_fw[i] = max(dp_fw[i - 1], dp_fw[i]);
    for (int i = n - 2; i >= 0; i--)
        dp_bw[i] = max(dp_bw[i + 1], dp_bw[i]);
    int ret = 0;
    for (int i = 1; i < n - 1; i++) {
        int mx = min(dp_fw[i], dp_bw[i]);
        ret += mx - ns[i];  // both maxima include ns[i], so mx >= ns[i]
    }
    return ret;
}
```
- Determine whether two linked lists intersect (CSDN)

## 2018-9-3
- Deep Learning Basics - Regularization - Batch Normalization (revised)

## 2018-9-2
- Ensemble learning topic write-up

## 2018-9-1
- Written tests & interview notes - Toutiao, summary of 4 interview rounds
- Algorithms - Strings - Base conversion / long-short URL conversion
- Algorithms - LeetCode - Continuous subarray sum

## 2018-8-31
- Data Structures - Strings - Infix to postfix expression conversion (Reverse Polish Notation)
@@ -126,7 +194,7 @@ Updates Log

## 2018-8-20
- DP - Egg drop problem
- DL - Topic - Optimization algorithms
- Basics - Glossary
- Notes on exponentially decaying averages
- LeetCode
- Data - Two pointers
Binary file added assets/TIM截图20180903222433.png
Binary file added assets/TIM截图20180903224842.png
Binary file added assets/公式_20180902220459.png
Binary file added assets/公式_20180903203229.png
Binary file added assets/公式_20180903210625.png
Binary file added assets/公式_20180903212935.png
Binary file added assets/公式_20180903213109.png
Binary file added assets/公式_20180903213410.png
Binary file added assets/公式_20180903220828.png
Binary file added assets/公式_20180903223427.png
Binary file added assets/公式_20180903224323.png
Binary file added assets/公式_20180903224557.png
27 changes: 27 additions & 0 deletions code/algorithm/Viterbi.py
@@ -0,0 +1,27 @@
Weather = ('Rainy', 'Sunny')
Activity = ('walk', 'shop', 'clean')

obs = list(range(len(Activity)))      # observation sequence
states_h = list(range(len(Weather)))  # hidden states

# initial probabilities (hidden states)
start_p = [0.6, 0.4]
# transition probabilities (between hidden states)
trans_p = [[0.7, 0.3],
           [0.4, 0.6]]
# emission probabilities (probability of a hidden state producing each observation)
emit_p = [[0.1, 0.4, 0.5],
          [0.6, 0.3, 0.1]]


def viterbi(obs, states_h, start_p, trans_p, emit_p):
    """Viterbi algorithm"""
    # build rows independently; `[[0.0] * k] * n` would alias the same list n times
    dp = [[0.0] * len(states_h) for _ in range(len(obs))]    # dp[t][i]: best prob of a path ending in state i at step t
    path = [[0] * len(obs) for _ in range(len(states_h))]    # path[i]: best state sequence ending in state i

    # initialization: prior probability times emission of the first observation
    for i in states_h:
        dp[0][i] = start_p[i] * emit_p[i][obs[0]]
        path[i][0] = i
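The committed file stops after initialization. A minimal sketch of the remaining recursion and backtracking (an assumption about how the function could be finished, reusing the same `dp`/`path` layout; not part of the commit) might look like:

```python
    # recursion: for each step t, keep the best predecessor for every current state j
    for t in range(1, len(obs)):
        new_path = [[0] * len(obs) for _ in range(len(states_h))]
        for j in states_h:
            prob, prev = max((dp[t - 1][i] * trans_p[i][j] * emit_p[j][obs[t]], i)
                             for i in states_h)
            dp[t][j] = prob
            new_path[j][:t + 1] = path[prev][:t] + [j]
        path = new_path

    # backtracking: the best final state determines the whole path
    best_prob, best_last = max((dp[-1][i], i) for i in states_h)
    return best_prob, path[best_last]


print(viterbi(obs, states_h, start_p, trans_p, emit_p))
```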


10 changes: 5 additions & 5 deletions code/my_tensorflow/src/layers/cnn.py
@@ -6,7 +6,7 @@
import tensorflow as tf

from ..activations import relu
from ..utils import get_wb
from ..utils import get_wb, get_shape


# TODO(huay)
@@ -21,8 +21,8 @@ def conv2d(x, kernel_size, out_channels,
name=None,
reuse=None):
"""2-D 卷积层
Input shape: [batch_size, in_w, in_h, in_channels]
Output shape: [batch_size, out_w, out_h, out_channels]
Input shape: [batch_size, in_h, in_w, in_channels]
Output shape: [batch_size, out_h, out_w, out_channels]
Args:
x(tf.Tensor):
@@ -45,8 +45,8 @@ def conv2d(x, kernel_size, out_channels,
assert len(kernel_size) == 2
assert len(strides) == 4

in_channels = int(x.get_shape()[-1])
kernel_shape = list(kernel_size) + [in_channels, out_channels]
in_channels = get_shape(x)[-1]
kernel_shape = list(kernel_size) + [in_channels, out_channels] # [kernel_h, kernel_w, in_channels, out_channels]

with tf.variable_scope(name or "conv2d", reuse=reuse):
W, b = get_wb(kernel_shape)
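Purely for illustration, a hedged usage sketch of this layer under TensorFlow 1.x, assuming the signature shown in the hunks above (NHWC input, `kernel_size` as `(h, w)`); the import path, default strides, and padding are assumptions, not taken from the repository:

```python
import tensorflow as tf

from my_tensorflow.src.layers.cnn import conv2d  # hypothetical import path

# input: [batch_size, in_h, in_w, in_channels]
x = tf.placeholder(tf.float32, [None, 32, 32, 3])
# assuming default strides/padding, output is [batch_size, out_h, out_w, 64]
y = conv2d(x, kernel_size=(3, 3), out_channels=64, name="conv1")
```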
2 changes: 1 addition & 1 deletion code/my_tensorflow/src/layers/match/attention_flow.py
@@ -91,7 +91,7 @@ def attention_flow(h, u, T=None, J=None, d=None, name=None, reuse=None):
W_s = get_w([3 * d, 1]) # [3d, 1]

# similarity matrix
S = tf.reshape(tf.einsum("ntjd,do->ntjo", h_u_hu, W_s), [-1, T, J])
S = tf.reshape(tf.einsum("ntjd,do->ntjo", h_u_hu, W_s), [-1, T, J]) # [N, T, J]
# the operation above is equivalent to
# S = tf.reshape(tf.matmul(tf.reshape(h_u_hu, [-1, 3*d]), W_s), [-1, T, J])
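As a sanity check on this equivalence, a standalone NumPy sketch (with made-up shapes; not part of the diff) comparing the two forms:

```python
import numpy as np

N, T, J, d = 2, 3, 4, 5
h_u_hu = np.random.rand(N, T, J, 3 * d)
W_s = np.random.rand(3 * d, 1)

# einsum form: contract the last axis with W_s, then drop the singleton axis
S1 = np.einsum("ntjd,do->ntjo", h_u_hu, W_s).reshape(-1, T, J)
# matmul form: flatten to 2-D, multiply, reshape back to [N, T, J]
S2 = (h_u_hu.reshape(-1, 3 * d) @ W_s).reshape(-1, T, J)

assert np.allclose(S1, S2)
```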

2 changes: 1 addition & 1 deletion code/my_tensorflow/src/utils/__init__.py
@@ -40,7 +40,7 @@ def get_wb(shape,
w_initializer=truncated_normal,
b_initializer=zeros,
w_regularizer=l2_regularizer,
b_regularizer=l2_regularizer,
b_regularizer=None, # biases are generally not penalized; regularizing them may lead to underfitting
name=None):
""""""
name = "" if name is None else name + '_'
File renamed without changes.
Binary file added papers/[2015].Batch_Normalization.v3.pdf
File renamed without changes.
File renamed without changes.
40 changes: 40 additions & 0 deletions project/NLP-事实类问答评测.md
@@ -0,0 +1,40 @@
NLP - Factoid Question Answering
===

Index
---
<!-- TOC -->

- [Task Description](#task-description)
- [Base Model - BiDAF](#base-model---bidaf)

<!-- /TOC -->

## Task Description
- For each question q, given a set of candidate answer passages a1, a2, …, an, design an algorithm that **extracts suitable words, phrases, or sentences** from the candidate passages to form a correct, complete, and concise piece of text as the predicted answer apred; the goal is for apred to answer question q correctly, completely, and concisely.

- **Example**
```
Question: Which is the largest inland basin in China?
Answer: The Tarim Basin
Passages:
1. The Tarim Basin in Xinjiang, China, is the largest inland basin in the world, stretching about 1,500 km from east to west and about 600 km at its widest from north to south. The basin floor lies at an elevation of around 1,000 m and covers an area of 530,000 square kilometers.
2. China's largest fixed and semi-fixed desert: between the Tianshan and Kunlun mountains lies the Tarim Basin, covering 530,000 square kilometers, the world's largest inland basin. At its center is the Taklamakan Desert, covering 337,000 square kilometers, the world's second-largest shifting-sand desert.
```

- **Data downloads**
  - [CIPS-SOGOU QA competition](http://task.www.sogou.com/cips-sogou_qa/) (small)
  - [Baidu WebQA V2.0](http://ai.baidu.com/broad/download)
  - [Baidu WebQA V1.0, preprocessed version](https://pan.baidu.com/s/1SADkZjF7kdH2Qk37LTdXKw) (password: kc2q)
  > [[Corpus] Baidu's Chinese QA dataset WebQA](https://spaces.ac.cn/archives/4338) - Scientific Spaces

## Base Model - BiDAF
> [1611.01603] [Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/abs/1611.01603)

**5/6-layer model structure**
1. Embedding layer (character + word)
1. Encoder layer
1. Attention interaction layer
1. Decoder layer
1. Output layer
41 changes: 41 additions & 0 deletions 机器学习-深度学习-NLP/Base-A-术语表.md
@@ -0,0 +1,41 @@
Glossary
===

Index
---
<!-- TOC -->

- [Exponentially Weighted Average (Exponentially Decaying Average)](#exponentially-weighted-average-exponentially-decaying-average)
- [Bias Correction](#bias-correction)

<!-- /TOC -->

## Exponentially Weighted Average (Exponentially Decaying Average)
> [What are exponentially weighted averages and bias correction? - Guo Yaohua](http://www.cnblogs.com/guoyaohua/p/8544835.html) - cnblogs
- **Weighted average**
- Suppose each `θi` has weight `ρi`; then the weighted average of the `θi` is:
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;v=\sum_{i=1}^t\rho_i\theta_i,\quad&space;where\&space;\sum_{i=1}^t\rho_i=1"><img src="../assets/公式_20180903213109.png" height="" /></a></div>

- **Exponentially weighted average**
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;\large&space;v_t=\rho&space;v_{t-1}&plus;(1-\rho)\theta_t"><img src="../assets/公式_20180903203229.png" height="" /></a></div>

> Note that the weights of older records **decay exponentially**, which is why the exponentially weighted average is also called the **exponentially decaying average**.
- **Example**: let `ρ=0.9, v0=0`

<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;\begin{aligned}&space;v_t&=0.1\theta_t&plus;0.9{\color{Red}v_{t-1}}\\&space;&=0.1\theta_t&plus;0.1*0.9\theta_{t-1}&plus;0.9^2{\color{Red}v_{t-2}}\\&space;&=0.1\theta_t&plus;0.1*0.9\theta_{t-1}&plus;0.1*0.9^2\theta_{t-2}&plus;\cdots&plus;0.1*0.9^{t-1}\theta_1&space;\end{aligned}"><img src="../assets/公式_20180903210625.png" height="" /></a></div>

> Here `v_t` can be viewed **approximately** as a moving average over the most recent `1/(1-ρ)` values (for `ρ=0.9`, `0.1 * 0.9^9 ≈ 0.038`); the weights of older records are already close to 0.
### Bias Correction
- In the early stages, the exponentially weighted average suffers from a large **error**
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;\sum_{i=1}^t0.1*0.9^{i-1}=0.1\cdot\frac{1-0.9^t}{1-0.9}=1-0.9^t"><img src="../assets/公式_20180903212935.png" height="" /></a></div>

- Note that the sum of the weights approaches 1 only as `t -> ∞`; when `t` is small, this is not a proper weighted average
- **Example**: let `ρ=0.9, v0=0`
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;\begin{aligned}&space;v_t&=0.1\theta_t&plus;0.9{\color{Red}v_{t-1}}\\&space;&=0.1\theta_t&plus;0.1*0.9\theta_{t-1}&plus;0.9^2{\color{Red}v_{t-2}}\\&space;&=0.1\theta_t&plus;0.1*0.9\theta_{t-1}&plus;0.1*0.9^2\theta_{t-2}&plus;\cdots&plus;0.1*0.9^{t-1}\theta_1&space;\end{aligned}"><img src="../assets/公式_20180903210625.png" height="" /></a></div>

- When `t` is small, the result deviates substantially from the desired weighted average
- **Introduce bias correction**
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;\large&space;\frac{v_t}{1-\rho^t}"><img src="../assets/公式_20180903213410.png" height="" /></a></div>

- Bias correction only matters in the **early stages**; **later on**, as `t` grows, `1-ρ^t -> 1` and the correction no longer affects `v_t`, which is what we want (see the sketch below)
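To make the formulas above concrete, a small illustrative NumPy sketch (the signal, `rho`, and the sequence length are made-up values):

```python
import numpy as np

rho = 0.9
theta = np.random.randn(100) + 10.0   # a noisy signal fluctuating around 10

v = 0.0
v_raw, v_corrected = [], []
for t, x in enumerate(theta, start=1):
    v = rho * v + (1 - rho) * x              # v_t = ρ v_{t-1} + (1-ρ) θ_t
    v_raw.append(v)
    v_corrected.append(v / (1 - rho ** t))   # bias correction: v_t / (1-ρ^t)

# early on, the raw average is biased toward the initial v0 = 0,
# while the corrected value tracks the signal from the very first step
print(v_raw[0], v_corrected[0])   # ≈ 0.1 * θ_1  vs  θ_1
```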
49 changes: 47 additions & 2 deletions 机器学习-深度学习-NLP/DL-A-深度学习基础.md
@@ -34,6 +34,9 @@ Index
- [Batch Normalization](#batch-normalization)
- [Motivation](#motivation)
- [Basic Principle](#basic-principle)
- [How BN differs between training and testing](#how-bn-differs-between-training-and-testing)
- [Why not use moving averages during training?](#why-not-use-moving-averages-during-training)
- [Further Reading](#further-reading)
- [L1/L2 Norm Regularization](#l1l2-norm-regularization)
- [Purpose, similarities and differences of the L1/L2 norms](#purpose-similarities-and-differences-of-the-l1l2-norms)
- [Why can L1 and L2 regularization prevent overfitting?](#why-can-l1-and-l2-regularization-prevent-overfitting)
@@ -57,6 +60,7 @@ Index
- Reflected in **evaluation metrics**: the model performs well on the training set but only moderately on the test set and on new data (**poor generalization**);

## Methods for Reducing the Risk of Overfitting
> All strategies intended to **reduce test error** are collectively referred to as **regularization methods**; they may come at the cost of a larger training error.
- **Data augmentation**
  - Images: translation, rotation, scaling
@@ -250,7 +254,10 @@ Index
# Regularization

## Batch Normalization
- BN is a regularization method whose purpose is to **speed up** network training and **prevent overfitting**
- BN is a **regularization** method (it reduces generalization error); its main effects are:
  - **Speeding up network training** (mitigating vanishing gradients and allowing larger learning rates)
  - **Preventing overfitting**
  - Lowering the requirements on **parameter initialization**.

### Motivation
- **The essence of training is learning the data distribution.** If the training data and test data follow different distributions, the model's **generalization ability** is **reduced**. Therefore, all input data should be normalized before training starts.
@@ -267,7 +274,45 @@ Index
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;\large&space;y_k\leftarrow&space;\gamma&space;\hat{x}_k&plus;\beta"><img src="../assets/公式_20180831165516.png" height="" /></a></div>

where `γ` and `β` are trainable parameters.


**Summary**
- The procedure above can be summarized as a single **`BN(x)` function**
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\large&space;\boldsymbol{y_i}=\mathrm{BN}(\boldsymbol{x_i})"><img src="../assets/公式_20180903223427.png" height="" /></a></div>

where
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\large&space;\begin{aligned}&space;\mathrm{BN}(\boldsymbol{x_i})&=\gamma\boldsymbol{\hat{x}_i}&plus;\beta\\&space;&=\gamma\frac{\boldsymbol{x_i}-\boldsymbol{\mathrm{E}[x_i]}}{\sqrt{\boldsymbol{\mathrm{Var}[x_i]}&plus;\epsilon}}&plus;\beta&space;\end{aligned}"><img src="../assets/公式_20180903224323.png" height="" /></a></div>

- **Full algorithm**
<div align="center"><img src="../assets/TIM截图20180903222433.png" height="" /></div>

### How BN differs between training and testing
- **During training**, a batch of data is fed in each time, and BN proceeds as described above;
- During **testing** or **prediction**, possibly only a **single example** is fed in at a time; the model then substitutes **global statistics** for the batch statistics;
  - Training on each batch produces a pair `(mean, variance)`;
  - The so-called global statistics are the expected values of these batch means and variances;
  - The concrete formulas are:
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\fn_jvn&space;\large&space;y_k\leftarrow&space;\gamma&space;\hat{x}_k&plus;\beta"><img src="../assets/公式_20180903220828.png" height="" /></a></div>

> Here `μ_i` and `σ_i` denote the mean and standard deviation saved for the i-th batch; `m` is the batch size, and the factor `m/(m-1)` yields the **unbiased variance estimate**
>> The original paper calls this method **moving averages**
- `BN(x)` is then adjusted to:
<div align="center"><a href="http://www.codecogs.com/eqnedit.php?latex=\large&space;\begin{aligned}&space;\mathrm{BN}(\boldsymbol{x_i})&=\gamma\frac{\boldsymbol{x_i}-\boldsymbol{\mathrm{E}[x_i]}}{\sqrt{\boldsymbol{\mathrm{Var}[x_i]}&plus;\epsilon}}&plus;\beta\\&space;&=\frac{\gamma}{\sqrt{\boldsymbol{\mathrm{Var}[x_i]}&plus;\epsilon}}\boldsymbol{x_i}&plus;\left&space;(&space;\beta-\frac{\gamma\boldsymbol{\mathrm{E}[x_i]}}{\sqrt{\boldsymbol{\mathrm{Var}[x_i]}&plus;\epsilon}}&space;\right&space;)&space;\end{aligned}"><img src="../assets/公式_20180903224557.png" height="" /></a></div>

- **Full algorithm** (see also the sketch below)
<div align="center"><img src="../assets/TIM截图20180903224842.png" height="" /></div>
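For intuition, a compact NumPy sketch of the two modes described above (shapes, names, and the exact averaging scheme are assumptions for illustration, not the repository's implementation):

```python
import numpy as np

def bn_train(x, gamma, beta, eps=1e-5):
    """Training mode: normalize x (shape [batch_size, features]) with batch statistics."""
    mu = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mu) / np.sqrt(var + eps)
    return gamma * x_hat + beta, mu, var   # also return the batch statistics to be saved

def bn_inference(x, gamma, beta, batch_mus, batch_vars, m, eps=1e-5):
    """Inference mode: use global statistics (expectations of the saved batch statistics);
    the factor m/(m-1) turns the biased batch variance into an unbiased estimate."""
    mean = np.mean(batch_mus, axis=0)
    var = (m / (m - 1)) * np.mean(batch_vars, axis=0)
    scale = gamma / np.sqrt(var + eps)
    return scale * x + (beta - scale * mean)
```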

#### Why not use moving averages during training?
> An interview question from a member of the study group
- The whole point of BN is to keep the distribution of each batch stable; using global statistics during training would defeat that purpose;
- The authors of BN argue that using moving averages during training may conflict with the gradient optimization;
> [**Original text**] "It is natural to ask whether we could simply **use the moving averages** µ, σ to perform the normalization **during training**, since this would remove the dependence of the normalized activations on the other example in the minibatch. This, however, has been observed to lead to the model blowing up. As argued in [6], such use of moving averages would cause the gradient optimization and the normalization to counteract each other. For example, the gradient step may increase a bias or scale the convolutional weights, in spite of the fact that the normalization would cancel the effect of these changes on the loss. This would result in unbounded growth of model parameters without actually improving the loss. It is thus crucial to use the minibatch moments, and to backpropagate through them."
>> [1702.03275] [Batch Renormalization](https://arxiv.org/abs/1702.03275)
### Further Reading
- [In-depth understanding of Batch Normalization - Guo Yaohua](https://www.cnblogs.com/guoyaohua/p/8724433.html) - cnblogs
- [Pitfalls of batch normalization in deep learning](http://ai.51cto.com/art/201705/540230.htm) - 51CTO


## L1/L2 Norm Regularization
> *Deep Learning*, 7.1.1 L2 Parameter Regularization & 7.1.2 L1 Parameter Regularization
