diff --git a/document/Building.Machine.Learning.Systems.with.Python.pdf b/document/Building.Machine.Learning.Systems.with.Python.pdf
new file mode 100644
index 0000000..b45c8ab
Binary files /dev/null and b/document/Building.Machine.Learning.Systems.with.Python.pdf differ
diff --git a/document/Natural%20Language%20Processing%20with%20Python.pdf b/document/Natural%20Language%20Processing%20with%20Python.pdf
new file mode 100644
index 0000000..e162b1f
Binary files /dev/null and b/document/Natural%20Language%20Processing%20with%20Python.pdf differ
diff --git "a/document/\347\244\276\344\272\244\347\275\221\347\253\231\347\232\204\346\225\260\346\215\256\346\214\226\346\216\230\344\270\216\345\210\206\346\236\220\350\213\261\346\226\207\347\211\210.pdf" "b/document/\347\244\276\344\272\244\347\275\221\347\253\231\347\232\204\346\225\260\346\215\256\346\214\226\346\216\230\344\270\216\345\210\206\346\236\220\350\213\261\346\226\207\347\211\210.pdf"
new file mode 100644
index 0000000..a8059b0
Binary files /dev/null and "b/document/\347\244\276\344\272\244\347\275\221\347\253\231\347\232\204\346\225\260\346\215\256\346\214\226\346\216\230\344\270\216\345\210\206\346\236\220\350\213\261\346\226\207\347\211\210.pdf" differ
diff --git a/notebooks/BagOfWords.ipynb b/notebooks/BagOfWords.ipynb
new file mode 100644
index 0000000..4cb6aa4
--- /dev/null
+++ b/notebooks/BagOfWords.ipynb
@@ -0,0 +1,548 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:63cc16054c19afe420fe419458902fbaf0be248a3a3d7ed28a7185154e78bac1"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from sklearn.feature_extraction.text import CountVectorizer"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from pandas import Series, DataFrame"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from weiboPredict import framework as fw"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "fw.loadData()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "train_context_clean = Series.from_csv('data/train_context_clean.csv')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "fw.weibo_train_data['context_clean'] = train_context_clean"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 7
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "vectorizer = CountVectorizer(analyzer = \"word\",   \\\n",
+      "                             tokenizer = None,    \\\n",
+      "                             preprocessor = None, \\\n",
+      "                             stop_words = None,   \\\n",
+      "                             max_features = 100) "
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_train_data_features = vectorizer.fit_transform(fw.weibo_train_data[(fw.weibo_train_data['time']<'2014-12-01') & \n",
+      "                                                                       (fw.weibo_train_data['time']>='2014-11-01')].context_clean)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print vectorizer.get_feature_names()[99]"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "u9ad8\n"
+       ]
+      }
+     ],
+     "prompt_number": 12
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print u'\\u9ad8'"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\u9ad8\n"
+       ]
+      }
+     ],
+     "prompt_number": 13
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_train_data_features = off_train_data_features.toarray()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 14
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_train_data_features.shape"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 15,
+       "text": [
+        "(268228, 100)"
+       ]
+      }
+     ],
+     "prompt_number": 15
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from sklearn import linear_model"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 16
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import numpy as np"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 22
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "LR_off_train_forward = linear_model.LinearRegression()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 17
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "LR_off_train_forward.fit(off_train_data_features,fw.weibo_train_data[(fw.weibo_train_data.time<'2014-12-01') & \n",
+      "                                                                       (fw.weibo_train_data['time']>='2014-11-01')].forward_count)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 18,
+       "text": [
+        "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
+       ]
+      }
+     ],
+     "prompt_number": 18
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_train_data_forward = fw.weibo_train_data[(fw.weibo_train_data.time<'2014-12-01') & (fw.weibo_train_data['time']>='2014-11-01')].forward_count"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 28
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# The coefficients\n",
+      "print 'Coefficients: \\n', LR_off_train_forward.coef_\n",
+      "# The mean squared error (np.mean of the squared residuals, despite the printed label)\n",
+      "print \"Residual sum of squares: %.2f\" % np.mean((LR_off_train_forward.predict(off_train_data_features) - off_train_data_forward) ** 2)\n",
+      "# R^2 (coefficient of determination) returned by score(): 1 is perfect prediction\n",
+      "print 'Variance score: %.2f' % LR_off_train_forward.score(off_train_data_features, off_train_data_forward)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "Coefficients: \n",
+        "[  1.28186683e+00   2.52042790e+00  -4.22240511e-01   3.00193074e-01\n",
+        "   3.28058189e-01  -2.13529577e-01   3.27640197e-01  -3.49218136e-03\n",
+        "  -7.51687673e-01   4.60758983e-01  -1.88868901e-01  -5.63483055e-01\n",
+        "  -1.83995649e-01   2.93752622e-01  -1.97645411e-01  -1.79253901e-01\n",
+        "  -2.06172987e-03   1.99173963e+00  -6.27022384e-01   6.15917705e-02\n",
+        "   2.85340672e-01  -6.74744046e-01   3.22703508e-01  -1.93906428e-01\n",
+        "  -7.66002106e-02  -3.09497938e-01  -3.45029518e-01   1.27872315e-01\n",
+        "   3.79986469e-01   2.80729951e-01   8.66619194e-01   1.06845509e+00\n",
+        "  -7.46800101e-01  -2.54218115e-01   2.73414376e+00  -1.47288430e-01\n",
+        "  -1.01462568e+00   4.30055528e-01  -2.93930772e-01   3.81736449e-01\n",
+        "  -3.66519904e-01   7.19002162e-01  -1.73325464e-01   3.24875449e-01\n",
+        "  -4.16267259e-01   1.09335789e+00  -5.77318469e-01  -5.27257101e-02\n",
+        "  -9.70816462e-01  -6.26686594e-01  -2.76311183e-01   9.30172159e-01\n",
+        "   9.38379061e-01   1.63664274e-01   4.09739757e-01  -7.65133900e-01\n",
+        "   3.31016884e-01  -2.58172161e-01   1.01198797e+00   1.77307286e+00\n",
+        "   3.61991774e-02  -3.89654803e-01  -1.99074142e-01  -6.28604106e-01\n",
+        "   9.55516554e-01  -7.35720041e-02  -4.58588853e-01  -2.23462541e-01\n",
+        "   2.15590694e-01   8.00243970e-01   1.16599814e-01   2.19498519e+00\n",
+        "  -2.66742669e-01  -4.14278221e-01  -1.95668630e-01  -9.36758394e-01\n",
+        "  -4.17932271e-01  -3.16992248e-01   4.58014954e-01  -5.32611220e-01\n",
+        "  -1.73775996e-01   7.05968914e-01   1.49674861e+00  -5.08237071e-01\n",
+        "  -2.97077601e-01  -4.55922806e-01   2.16940611e-01   5.40526987e-02\n",
+        "  -4.94225407e-01   1.04665258e+00   2.38683146e+00   2.94031871e+00\n",
+        "  -9.84394767e-01   2.98732515e-01  -7.10995426e-02  -4.39575815e-01\n",
+        "  -8.15566261e-01   2.66261832e-01  -4.48605915e-01  -2.35131478e-01]\n",
+        "Residual sum of squares: 4199.03"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "Variance score: 0.00"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n"
+       ]
+      }
+     ],
+     "prompt_number": 29
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_test_data_features = vectorizer.transform(fw.weibo_train_data[(fw.weibo_train_data['time']<='2014-12-31') & \n",
+      "                                                                       (fw.weibo_train_data['time']>='2014-12-01')].context_clean)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 20
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_test_data_features = off_test_data_features.toarray()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 21
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_test_data_forward = fw.weibo_train_data[(fw.weibo_train_data.time<='2014-12-31') & (fw.weibo_train_data['time']>='2014-12-01')].forward_count"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 30
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "LR_off_test_forward = linear_model.LinearRegression()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 31
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "LR_off_test_forward.fit(off_test_data_features,off_test_data_forward)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 32,
+       "text": [
+        "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
+       ]
+      }
+     ],
+     "prompt_number": 32
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# The coefficients of LR_off_test_forward; the metrics below evaluate LR_off_train_forward instead\n",
+      "print 'Coefficients: \\n', LR_off_test_forward.coef_\n",
+      "# The mean squared error of LR_off_train_forward on the test features (np.mean, despite the printed label)\n",
+      "print \"Residual sum of squares: %.2f\" % np.mean((LR_off_train_forward.predict(off_test_data_features) - off_test_data_forward) ** 2)\n",
+      "# R^2 (coefficient of determination) of LR_off_train_forward on the test data: 1 is perfect prediction\n",
+      "print 'Variance score: %.2f' % LR_off_train_forward.score(off_test_data_features, off_test_data_forward)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "Coefficients: \n",
+        "[-0.1171739   0.63265743 -0.57442636  1.08746757  0.20182595  0.33553953\n",
+        "  0.81983983 -0.0211989  -0.66764881  0.12984936 -0.24389544  0.36226616\n",
+        "  0.06604472  0.20332698  0.50035616  0.28453848  0.26696837  0.33407466\n",
+        " -0.51198033 -0.79661462  0.39749733 -0.55538544  1.22117418 -0.22280062\n",
+        "  0.27924758  0.06368221 -0.07490052 -0.27374777 -0.32021338 -0.67411311\n",
+        "  0.50933275  0.23655788 -0.45525629  0.0289753   2.67015218 -0.24295205\n",
+        " -0.72936656  0.14092331  0.13797876  0.08975027 -0.09392222  0.5397419\n",
+        " -0.28049526  0.09968362 -0.30238392  1.40514351 -0.19270185  0.05052858\n",
+        " -0.82315137 -0.25508024 -0.44458304  0.84594892  0.21754754 -0.05292221\n",
+        " -0.67396415 -0.30519182  0.85181616  0.55422181 -0.4006845   1.06301328\n",
+        " -0.05069149 -0.3123591  -0.09636032 -0.22507588  0.747482   -0.24339028\n",
+        " -0.63067304 -0.05132197  0.0145011  -0.04456321  0.16510655  1.78731256\n",
+        "  0.45439284 -0.14257295 -0.45809281 -0.37898258 -0.02587705 -0.21514858\n",
+        "  0.52580601 -0.38464043  0.23830032 -0.31960683  2.22860638 -0.53478454\n",
+        " -0.22331493 -0.30997626  0.08812845 -0.2962532   0.06009859 -0.25908015\n",
+        "  1.85063471  3.14683021 -0.51927151  0.13069531 -0.44594174  0.06274401\n",
+        " -0.04162402 -0.61135463 -0.6250505  -0.10684694]\n",
+        "Residual sum of squares: 2902.73"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "Variance score: 0.00"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n"
+       ]
+      }
+     ],
+     "prompt_number": 33
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_predict_forward = LR_off_train_forward.predict(off_test_data_features)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 35
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_test_data_forward[20:40]"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 55,
+       "text": [
+        "73     0\n",
+        "81     0\n",
+        "82     0\n",
+        "119    0\n",
+        "120    0\n",
+        "121    0\n",
+        "122    0\n",
+        "123    0\n",
+        "124    0\n",
+        "183    0\n",
+        "300    0\n",
+        "301    0\n",
+        "302    2\n",
+        "303    1\n",
+        "304    0\n",
+        "305    0\n",
+        "306    0\n",
+        "307    0\n",
+        "308    0\n",
+        "309    2\n",
+        "Name: forward_count, dtype: int64"
+       ]
+      }
+     ],
+     "prompt_number": 55
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "Series(off_predict_forward)[20:40].round()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 58,
+       "text": [
+        "20    1\n",
+        "21    0\n",
+        "22    1\n",
+        "23    0\n",
+        "24    2\n",
+        "25    3\n",
+        "26   -0\n",
+        "27   -0\n",
+        "28   -0\n",
+        "29    2\n",
+        "30    4\n",
+        "31    2\n",
+        "32    6\n",
+        "33   -6\n",
+        "34   -3\n",
+        "35    5\n",
+        "36   -1\n",
+        "37    4\n",
+        "38    3\n",
+        "39    2\n",
+        "dtype: float64"
+       ]
+      }
+     ],
+     "prompt_number": 58
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "off_test_data_forward.count()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 49,
+       "text": [
+        "272702"
+       ]
+      }
+     ],
+     "prompt_number": 49
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
diff --git a/DataExploration.ipynb b/notebooks/DataExploration.ipynb
similarity index 100%
rename from DataExploration.ipynb
rename to notebooks/DataExploration.ipynb
diff --git a/fenci.ipynb b/notebooks/fenci.ipynb
similarity index 86%
rename from fenci.ipynb
rename to notebooks/fenci.ipynb
index d9c08ae..398e87d 100644
--- a/fenci.ipynb
+++ b/notebooks/fenci.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:19e166f8649d87303d678d94653359d7d52be6a0921728c3551435df422231f4"
+  "signature": "sha256:70462ecc59f754aafe6fdafca0cc9b261a08bcd8adcbd9fe4302ae319a3de649"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -5214,7 +5214,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 268
+     "prompt_number": 7
     },
     {
      "cell_type": "code",
@@ -5228,13 +5228,13 @@
       {
        "metadata": {},
        "output_type": "pyout",
-       "prompt_number": 284,
+       "prompt_number": 11,
        "text": [
-        "<module 'weiboPredict.framework' from 'weiboPredict/framework.pyc'>"
+        "<module 'weiboPredict.framework' from 'weiboPredict/framework.py'>"
        ]
       }
      ],
-     "prompt_number": 284
+     "prompt_number": 11
     },
     {
      "cell_type": "code",
@@ -5245,7 +5245,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 282
+     "prompt_number": 1
     },
     {
      "cell_type": "code",
@@ -5278,7 +5278,7 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 286
+     "prompt_number": 2
     },
     {
      "cell_type": "code",
@@ -5288,8 +5288,65 @@
      ],
      "language": "python",
      "metadata": {},
-     "outputs": [],
-     "prompt_number": 288
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "Building prefix dict from /Users/Pro/canopy/lib/python2.7/site-packages/jieba/dict.txt ...\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "DEBUG:jieba:Building prefix dict from /Users/Pro/canopy/lib/python2.7/site-packages/jieba/dict.txt ...\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "Loading model from cache /var/folders/rv/scpt0ywn277bqhjg160ms0d40000gp/T/jieba.cache\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "DEBUG:jieba:Loading model from cache /var/folders/rv/scpt0ywn277bqhjg160ms0d40000gp/T/jieba.cache\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "Loading model cost 0.556 seconds.\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "DEBUG:jieba:Loading model cost 0.556 seconds.\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "Prefix dict has been built succesfully.\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "DEBUG:jieba:Prefix dict has been built succesfully.\n"
+       ]
+      }
+     ],
+     "prompt_number": 3
     },
     {
      "cell_type": "code",
@@ -5308,7 +5365,7 @@
        ]
       }
      ],
-     "prompt_number": 295
+     "prompt_number": 4
     },
     {
      "cell_type": "code",
@@ -5473,7 +5530,1578 @@
      ],
      "language": "python",
      "metadata": {},
-     "outputs": []
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "10000/1626750\n",
+        "20000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "30000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "40000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "50000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "60000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "70000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "80000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "90000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "100000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "110000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "120000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "130000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "140000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "150000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "160000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "170000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "180000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "190000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "200000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "210000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "220000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "230000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "240000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "250000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "260000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "270000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "280000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "290000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "300000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "310000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "320000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "330000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "340000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "350000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "360000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "370000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "380000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "390000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "400000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "410000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "420000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "430000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "440000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "450000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "460000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "470000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "480000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "490000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "500000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "510000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "520000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "530000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "540000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "550000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "560000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "570000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "580000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "590000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "600000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "610000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "620000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "630000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "640000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "650000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "660000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "670000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "680000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "690000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "700000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "710000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "720000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "730000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "740000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "750000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "760000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "770000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "780000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "790000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "800000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "810000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "820000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "830000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "840000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "850000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "860000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "870000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "880000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "890000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "900000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "910000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "920000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "930000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "940000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "950000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "960000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "970000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "980000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "990000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1000000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1010000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1020000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1030000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1040000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1050000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1060000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1070000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1080000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1090000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1100000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1110000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1120000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1130000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1140000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1150000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1160000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1170000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1180000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1190000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1200000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1210000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1220000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1230000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1240000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1250000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1260000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1270000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1280000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1290000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1300000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1310000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1320000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1330000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1340000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1350000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1360000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1370000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1380000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1390000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1400000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1410000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1420000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1430000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1440000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1450000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1460000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1470000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1480000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1490000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1500000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1510000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1520000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1530000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1540000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1550000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1560000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1570000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1580000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1590000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1600000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1610000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "1620000/1626750"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n"
+       ]
+      }
+     ],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "predict_clean_text = fw.cleanText(fw.weibo_predict_data.context)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "10000/275331\n",
+        "20000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "30000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "40000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "50000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "60000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "70000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "80000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "90000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "100000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "110000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "120000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "130000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "140000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "150000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "160000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "170000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "180000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "190000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "200000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "210000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "220000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "230000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "240000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "250000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "260000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "270000/275331"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n"
+       ]
+      }
+     ],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "clean_context.head()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 8,
+       "text": [
+        "0    [\u5410, \u4eac\u4e1c, \u7528\u6237, \u4f53\u9a8c, \u592a\u5dee, ...., \u7b2c\u4e00\u6b21, \u4eac\u4e1c, \u8d2d\u7269, \u4e70, \u51e0\u767e, ...\n",
+        "1    [\u7231, \u5c0f\u7c73, \u624b\u673a, \u5c0f\u7c73, \u624b\u673a, \u5c4f\u5e55, \u592a\u7ed9\u529b, \u82f1\u5bf8, \u9ad8, \u8272\u5f69, \u9971\u548c\u5ea6, \u590f...\n",
+        "2    [doge, \u55b5, \u55b5, \u5929\u732b, \u771f\u662f, \u725b, ...., \u5356, \u5b9e\u9a8c, \u8017\u6750, \u8dd1, \u5929\u732b...\n",
+        "3                         [doge, \u627e, \u5de5\u4f5c, \u256e, \u256f, \u25bd, \u2570, \u256d]\n",
+        "4                                [doge, \u6210\u529f, \u62a2, \u53f0\u7c73, \u7535\u4fe1]\n",
+        "dtype: object"
+       ]
+      }
+     ],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "clean_context.to_csv('data/train_context_clean.csv')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "predict_clean_text.to_csv('data/predict_context_clean.csv')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 10
     },
     {
      "cell_type": "code",
diff --git a/weiboPredict/framework.py b/weiboPredict/framework.py
index 5bb3731..6abbbfe 100644
--- a/weiboPredict/framework.py
+++ b/weiboPredict/framework.py
@@ -2,9 +2,13 @@
 # Usage: basic operations
 # Author: Chen Li
 import pandas as pd
+import numpy as np
 import csv
 import re
 import jieba
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn import linear_model
+from sklearn.externals import joblib
 
 weibo_train_data = None
 weibo_predict_data = None
@@ -36,6 +40,7 @@ def cleanText(contexts):
 		stopwords[i] = stopwords[i].decode('utf8')
 	f.close()
 
+	i=0
 	cleans = []
 	for context in contexts:
 	    context = re.sub("http://.*\w$","",context)
@@ -46,7 +51,46 @@ def cleanText(contexts):
 	    text = jieba.lcut(context)
 	    clean = [t for t in text if t not in stopwords]
 	    cleans.append(clean)
+	    i=i+1
+	    if i%10000==0:
+	    	print str(i)+'/'+str(len(contexts))
 	return pd.Series(cleans)
 
+def train(start,end,label,feature_type,model_type):
+	"""Fit a linear regression on CountVectorizer features of the cleaned training text.
+
+	Only weibo_train_data rows with start <= time <= end are used; label names the
+	target column, feature_type only appears in the banner and the file name, and
+	model_type must be "LR". Reads data/train_context_clean.csv, prints fit
+	statistics, dumps the model to <feature_type>_<model_type>_<start>_<end>.model
+	and returns it. Raises ValueError for any other model_type.
+	"""
+	global weibo_train_data
+	if model_type != "LR":
+		# Fail early: nothing below can run without a fitted model.
+		raise ValueError("unsupported model_type: %r" % (model_type,))
+	# Series is reached through pd because the visible imports bind pandas only as pd.
+	weibo_train_data['context_clean'] = pd.Series.from_csv('data/train_context_clean.csv')
+	# Build the time-window mask once so features and labels cover the same rows.
+	in_window = (weibo_train_data['time']>=start) & (weibo_train_data['time']<=end)
+	vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
+					stop_words=None, max_features=100)
+	train_features = vectorizer.fit_transform(weibo_train_data[in_window].context_clean).toarray()
+	train_labels = weibo_train_data[in_window][label]
+	model = linear_model.LinearRegression()
+	model.fit(train_features,train_labels)
+	print('====='+feature_type+'_'+model_type+'=====')
+	print('Coefficients: \n%s' % (model.coef_,))
+	# np.mean of the squared residuals is the mean squared error, not their sum.
+	print("Mean squared error: %.2f" % np.mean((model.predict(train_features) - train_labels) ** 2))
+	print('Variance score: %.2f' % model.score(train_features, train_labels))
+	joblib.dump(model,'%s_%s_%s_%s.model' % (feature_type,model_type,start,end))
+	return model
+
+
+
+
+
+
 if __name__ == "__main__":
 	loadData()
\ No newline at end of file