diff --git a/document/Building.Machine.Learning.Systems.with.Python.pdf b/document/Building.Machine.Learning.Systems.with.Python.pdf new file mode 100644 index 0000000..b45c8ab Binary files /dev/null and b/document/Building.Machine.Learning.Systems.with.Python.pdf differ diff --git a/document/Natural%20Language%20Processing%20with%20Python.pdf b/document/Natural%20Language%20Processing%20with%20Python.pdf new file mode 100644 index 0000000..e162b1f Binary files /dev/null and b/document/Natural%20Language%20Processing%20with%20Python.pdf differ diff --git "a/document/\347\244\276\344\272\244\347\275\221\347\253\231\347\232\204\346\225\260\346\215\256\346\214\226\346\216\230\344\270\216\345\210\206\346\236\220\350\213\261\346\226\207\347\211\210.pdf" "b/document/\347\244\276\344\272\244\347\275\221\347\253\231\347\232\204\346\225\260\346\215\256\346\214\226\346\216\230\344\270\216\345\210\206\346\236\220\350\213\261\346\226\207\347\211\210.pdf" new file mode 100644 index 0000000..a8059b0 Binary files /dev/null and "b/document/\347\244\276\344\272\244\347\275\221\347\253\231\347\232\204\346\225\260\346\215\256\346\214\226\346\216\230\344\270\216\345\210\206\346\236\220\350\213\261\346\226\207\347\211\210.pdf" differ diff --git a/notebooks/BagOfWords.ipynb b/notebooks/BagOfWords.ipynb new file mode 100644 index 0000000..4cb6aa4 --- /dev/null +++ b/notebooks/BagOfWords.ipynb @@ -0,0 +1,548 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:63cc16054c19afe420fe419458902fbaf0be248a3a3d7ed28a7185154e78bac1" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.feature_extraction.text import CountVectorizer" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from pandas import Series, DataFrame" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from weiboPredict import framework as fw" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fw.loadData()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "train_context_clean = Series.from_csv('data/train_context_clean.csv')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fw.weibo_train_data['context_clean'] = train_context_clean" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "vectorizer = CountVectorizer(analyzer = \"word\", \\\n", + " tokenizer = None, \\\n", + " preprocessor = None, \\\n", + " stop_words = None, \\\n", + " max_features = 100) " + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 8 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_train_data_features = vectorizer.fit_transform(fw.weibo_train_data[(fw.weibo_train_data['time']<'2014-12-01') & \n", + " (fw.weibo_train_data['time']>='2014-11-01')].context_clean)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print vectorizer.get_feature_names()[99]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "u9ad8\n" + ] + } + ], + "prompt_number": 12 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print u'\\u9ad8'" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\u9ad8\n" + ] + } + ], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_train_data_features = off_train_data_features.toarray()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 14 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_train_data_features.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 15, + "text": [ + "(268228, 100)" + ] + } + ], + "prompt_number": 15 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn import linear_model" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 16 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import numpy as np" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 22 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "LR_off_train_forward = linear_model.LinearRegression()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 17 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "LR_off_train_forward.fit(off_train_data_features,fw.weibo_train_data[(fw.weibo_train_data.time<'2014-12-01') & \n", + " (fw.weibo_train_data['time']>='2014-11-01')].forward_count)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 18, + "text": [ + "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)" + ] + } + ], + "prompt_number": 18 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_train_data_forward = fw.weibo_train_data[(fw.weibo_train_data.time<'2014-12-01') & (fw.weibo_train_data['time']>='2014-11-01')].forward_count" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 28 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# The coefficients\n", + "print 'Coefficients: \\n', LR_off_train_forward.coef_\n", + "# The mean square error\n", + "print \"Residual sum of squares: %.2f\" % np.mean((LR_off_train_forward.predict(off_train_data_features) - off_train_data_forward) ** 2)\n", + "# Explained variance score: 1 is perfect prediction\n", + "print 'Variance score: %.2f' % LR_off_train_forward.score(off_train_data_features, off_train_data_forward)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Coefficients: \n", + "[ 1.28186683e+00 2.52042790e+00 -4.22240511e-01 3.00193074e-01\n", + " 3.28058189e-01 -2.13529577e-01 3.27640197e-01 -3.49218136e-03\n", + " -7.51687673e-01 4.60758983e-01 -1.88868901e-01 -5.63483055e-01\n", + " -1.83995649e-01 2.93752622e-01 -1.97645411e-01 -1.79253901e-01\n", + " -2.06172987e-03 1.99173963e+00 -6.27022384e-01 6.15917705e-02\n", + " 2.85340672e-01 -6.74744046e-01 3.22703508e-01 -1.93906428e-01\n", + " -7.66002106e-02 -3.09497938e-01 -3.45029518e-01 1.27872315e-01\n", + " 3.79986469e-01 2.80729951e-01 8.66619194e-01 1.06845509e+00\n", + " -7.46800101e-01 -2.54218115e-01 2.73414376e+00 -1.47288430e-01\n", + " -1.01462568e+00 4.30055528e-01 -2.93930772e-01 3.81736449e-01\n", + " -3.66519904e-01 7.19002162e-01 -1.73325464e-01 3.24875449e-01\n", + " -4.16267259e-01 1.09335789e+00 -5.77318469e-01 -5.27257101e-02\n", + " -9.70816462e-01 -6.26686594e-01 -2.76311183e-01 9.30172159e-01\n", + " 9.38379061e-01 1.63664274e-01 4.09739757e-01 -7.65133900e-01\n", + " 3.31016884e-01 -2.58172161e-01 1.01198797e+00 1.77307286e+00\n", + " 3.61991774e-02 -3.89654803e-01 -1.99074142e-01 -6.28604106e-01\n", + " 9.55516554e-01 -7.35720041e-02 -4.58588853e-01 -2.23462541e-01\n", + " 2.15590694e-01 8.00243970e-01 1.16599814e-01 2.19498519e+00\n", + " -2.66742669e-01 -4.14278221e-01 -1.95668630e-01 -9.36758394e-01\n", + " -4.17932271e-01 -3.16992248e-01 4.58014954e-01 -5.32611220e-01\n", + " -1.73775996e-01 7.05968914e-01 1.49674861e+00 -5.08237071e-01\n", + " -2.97077601e-01 -4.55922806e-01 2.16940611e-01 5.40526987e-02\n", + " -4.94225407e-01 1.04665258e+00 2.38683146e+00 2.94031871e+00\n", + " -9.84394767e-01 2.98732515e-01 -7.10995426e-02 -4.39575815e-01\n", + " -8.15566261e-01 2.66261832e-01 -4.48605915e-01 -2.35131478e-01]\n", + "Residual sum of squares: 4199.03" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "Variance score: 0.00" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n" + ] + } + ], + "prompt_number": 29 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_test_data_features = vectorizer.transform(fw.weibo_train_data[(fw.weibo_train_data['time']<='2014-12-31') & \n", + " (fw.weibo_train_data['time']>='2014-12-01')].context_clean)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 20 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_test_data_features = off_test_data_features.toarray()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 21 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_test_data_forward = fw.weibo_train_data[(fw.weibo_train_data.time<='2014-12-31') & (fw.weibo_train_data['time']>='2014-12-01')].forward_count" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 30 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "LR_off_test_forward = linear_model.LinearRegression()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 31 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "LR_off_test_forward.fit(off_test_data_features,off_test_data_forward)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 32, + "text": [ + "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)" + ] + } + ], + "prompt_number": 32 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# The coefficients\n", + "print 'Coefficients: \\n', LR_off_test_forward.coef_\n", + "# The mean square error\n", + "print \"Residual sum of squares: %.2f\" % np.mean((LR_off_train_forward.predict(off_test_data_features) - off_test_data_forward) ** 2)\n", + "# Explained variance score: 1 is perfect prediction\n", + "print 'Variance score: %.2f' % LR_off_train_forward.score(off_test_data_features, off_test_data_forward)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Coefficients: \n", + "[-0.1171739 0.63265743 -0.57442636 1.08746757 0.20182595 0.33553953\n", + " 0.81983983 -0.0211989 -0.66764881 0.12984936 -0.24389544 0.36226616\n", + " 0.06604472 0.20332698 0.50035616 0.28453848 0.26696837 0.33407466\n", + " -0.51198033 -0.79661462 0.39749733 -0.55538544 1.22117418 -0.22280062\n", + " 0.27924758 0.06368221 -0.07490052 -0.27374777 -0.32021338 -0.67411311\n", + " 0.50933275 0.23655788 -0.45525629 0.0289753 2.67015218 -0.24295205\n", + " -0.72936656 0.14092331 0.13797876 0.08975027 -0.09392222 0.5397419\n", + " -0.28049526 0.09968362 -0.30238392 1.40514351 -0.19270185 0.05052858\n", + " -0.82315137 -0.25508024 -0.44458304 0.84594892 0.21754754 -0.05292221\n", + " -0.67396415 -0.30519182 0.85181616 0.55422181 -0.4006845 1.06301328\n", + " -0.05069149 -0.3123591 -0.09636032 -0.22507588 0.747482 -0.24339028\n", + " -0.63067304 -0.05132197 0.0145011 -0.04456321 0.16510655 1.78731256\n", + " 0.45439284 -0.14257295 -0.45809281 -0.37898258 -0.02587705 -0.21514858\n", + " 0.52580601 -0.38464043 0.23830032 -0.31960683 2.22860638 -0.53478454\n", + " -0.22331493 -0.30997626 0.08812845 -0.2962532 0.06009859 -0.25908015\n", + " 1.85063471 3.14683021 -0.51927151 0.13069531 -0.44594174 0.06274401\n", + " -0.04162402 -0.61135463 -0.6250505 -0.10684694]\n", + "Residual sum of squares: 2902.73" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "Variance score: 0.00" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n" + ] + } + ], + "prompt_number": 33 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_predict_forward = LR_off_train_forward.predict(off_test_data_features)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 35 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_test_data_forward[20:40]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 55, + "text": [ + "73 0\n", + "81 0\n", + "82 0\n", + "119 0\n", + "120 0\n", + "121 0\n", + "122 0\n", + "123 0\n", + "124 0\n", + "183 0\n", + "300 0\n", + "301 0\n", + "302 2\n", + "303 1\n", + "304 0\n", + "305 0\n", + "306 0\n", + "307 0\n", + "308 0\n", + "309 2\n", + "Name: forward_count, dtype: int64" + ] + } + ], + "prompt_number": 55 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "Series(off_predict_forward)[20:40].round()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 58, + "text": [ + "20 1\n", + "21 0\n", + "22 1\n", + "23 0\n", + "24 2\n", + "25 3\n", + "26 -0\n", + "27 -0\n", + "28 -0\n", + "29 2\n", + "30 4\n", + "31 2\n", + "32 6\n", + "33 -6\n", + "34 -3\n", + "35 5\n", + "36 -1\n", + "37 4\n", + "38 3\n", + "39 2\n", + "dtype: float64" + ] + } + ], + "prompt_number": 58 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "off_test_data_forward.count()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 49, + "text": [ + "272702" + ] + } + ], + "prompt_number": 49 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/DataExploration.ipynb b/notebooks/DataExploration.ipynb similarity index 100% rename from DataExploration.ipynb rename to notebooks/DataExploration.ipynb diff --git a/fenci.ipynb b/notebooks/fenci.ipynb similarity index 86% rename from fenci.ipynb rename to notebooks/fenci.ipynb index d9c08ae..398e87d 100644 --- a/fenci.ipynb +++ b/notebooks/fenci.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:19e166f8649d87303d678d94653359d7d52be6a0921728c3551435df422231f4" + "signature": "sha256:70462ecc59f754aafe6fdafca0cc9b261a08bcd8adcbd9fe4302ae319a3de649" }, "nbformat": 3, "nbformat_minor": 0, @@ -5214,7 +5214,7 @@ "language": "python", "metadata": {}, "outputs": [], - "prompt_number": 268 + "prompt_number": 7 }, { "cell_type": "code", @@ -5228,13 +5228,13 @@ { "metadata": {}, "output_type": "pyout", - "prompt_number": 284, + "prompt_number": 11, "text": [ - "" + "" ] } ], - "prompt_number": 284 + "prompt_number": 11 }, { "cell_type": "code", @@ -5245,7 +5245,7 @@ "language": "python", "metadata": {}, "outputs": [], - "prompt_number": 282 + "prompt_number": 1 }, { "cell_type": "code", @@ -5278,7 +5278,7 @@ "language": "python", "metadata": {}, "outputs": [], - "prompt_number": 286 + "prompt_number": 2 }, { "cell_type": "code", @@ -5288,8 +5288,65 @@ ], "language": "python", "metadata": {}, - "outputs": [], - "prompt_number": 288 + "outputs": [ + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "Building prefix dict from /Users/Pro/canopy/lib/python2.7/site-packages/jieba/dict.txt ...\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "DEBUG:jieba:Building prefix dict from /Users/Pro/canopy/lib/python2.7/site-packages/jieba/dict.txt ...\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "Loading model from cache /var/folders/rv/scpt0ywn277bqhjg160ms0d40000gp/T/jieba.cache\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "DEBUG:jieba:Loading model from cache /var/folders/rv/scpt0ywn277bqhjg160ms0d40000gp/T/jieba.cache\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "Loading model cost 0.556 seconds.\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "DEBUG:jieba:Loading model cost 0.556 seconds.\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "Prefix dict has been built succesfully.\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "DEBUG:jieba:Prefix dict has been built succesfully.\n" + ] + } + ], + "prompt_number": 3 }, { "cell_type": "code", @@ -5308,7 +5365,7 @@ ] } ], - "prompt_number": 295 + "prompt_number": 4 }, { "cell_type": "code", @@ -5473,7 +5530,1578 @@ ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "10000/1626750\n", + "20000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "30000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "40000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "50000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "60000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "70000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "80000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "90000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "100000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "110000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "120000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "130000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "140000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "150000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "160000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "170000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "180000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "190000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "200000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "210000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "220000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "230000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "240000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "250000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "260000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "270000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "280000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "290000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "300000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "310000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "320000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "330000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "340000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "350000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "360000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "370000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "380000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "390000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "400000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "410000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "420000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "430000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "440000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "450000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "460000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "470000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "480000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "490000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "500000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "510000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "520000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "530000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "540000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "550000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "560000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "570000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "580000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "590000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "600000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "610000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "620000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "630000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "640000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "650000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "660000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "670000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "680000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "690000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "700000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "710000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "720000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "730000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "740000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "750000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "760000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "770000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "780000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "790000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "800000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "810000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "820000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "830000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "840000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "850000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "860000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "870000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "880000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "890000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "900000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "910000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "920000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "930000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "940000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "950000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "960000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "970000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "980000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "990000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1000000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1010000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1020000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1030000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1040000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1050000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1060000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1070000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1080000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1090000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1100000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1110000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1120000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1130000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1140000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1150000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1160000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1170000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1180000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1190000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1200000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1210000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1220000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1230000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1240000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1250000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1260000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1270000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1280000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1290000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1300000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1310000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1320000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1330000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1340000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1350000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1360000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1370000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1380000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1390000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1400000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1410000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1420000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1430000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1440000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1450000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1460000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1470000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1480000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1490000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1500000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1510000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1520000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1530000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1540000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1550000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1560000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1570000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1580000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1590000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1600000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1610000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "1620000/1626750" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n" + ] + } + ], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "predict_clean_text = fw.cleanText(fw.weibo_predict_data.context)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "10000/275331\n", + "20000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "30000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "40000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "50000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "60000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "70000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "80000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "90000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "100000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "110000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "120000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "130000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "140000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "150000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "160000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "170000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "180000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "190000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "200000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "210000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "220000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "230000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "240000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "250000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "260000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n", + "270000/275331" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n" + ] + } + ], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clean_context.head()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 8, + "text": [ + "0 [\u5410, \u4eac\u4e1c, \u7528\u6237, \u4f53\u9a8c, \u592a\u5dee, ...., \u7b2c\u4e00\u6b21, \u4eac\u4e1c, \u8d2d\u7269, \u4e70, \u51e0\u767e, ...\n", + "1 [\u7231, \u5c0f\u7c73, \u624b\u673a, \u5c0f\u7c73, \u624b\u673a, \u5c4f\u5e55, \u592a\u7ed9\u529b, \u82f1\u5bf8, \u9ad8, \u8272\u5f69, \u9971\u548c\u5ea6, \u590f...\n", + "2 [doge, \u55b5, \u55b5, \u5929\u732b, \u771f\u662f, \u725b, ...., \u5356, \u5b9e\u9a8c, \u8017\u6750, \u8dd1, \u5929\u732b...\n", + "3 [doge, \u627e, \u5de5\u4f5c, \u256e, \u256f, \u25bd, \u2570, \u256d]\n", + "4 [doge, \u6210\u529f, \u62a2, \u53f0\u7c73, \u7535\u4fe1]\n", + "dtype: object" + ] + } + ], + "prompt_number": 8 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clean_context.to_csv('data/train_context_clean.csv')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "predict_clean_text.to_csv('data/predict_context_clean.csv')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 10 }, { "cell_type": "code", diff --git a/weiboPredict/framework.py b/weiboPredict/framework.py index 5bb3731..6abbbfe 100644 --- a/weiboPredict/framework.py +++ b/weiboPredict/framework.py @@ -2,9 +2,13 @@ # Usage: basic operations # Author: Chen Li import pandas as pd +import numpy as np import csv import re import jieba +from sklearn.feature_extraction.text import CountVectorizer +from sklearn import linear_model +from sklearn.externals import joblib weibo_train_data = None weibo_predict_data = None @@ -36,6 +40,7 @@ def cleanText(contexts): stopwords[i] = stopwords[i].decode('utf8') f.close() + i=0 cleans = [] for context in contexts: context = re.sub("http://.*\w$","",context) @@ -46,7 +51,46 @@ def cleanText(contexts): text = jieba.lcut(context) clean = [t for t in text if t not in stopwords] cleans.append(clean) + i=i+1 + if i%10000==0: + print str(i)+'/'+str(len(contexts)) return pd.Series(cleans) +def train(start,end,label,feature_type,model_type): + global weibo_train_data + train_context_clean = Series.from_csv('data/train_context_clean.csv') + weibo_train_data['context_clean'] = train_context_clean + if model_type=="LR": + vectorizer = CountVectorizer(analyzer = "word", \ + tokenizer = None, \ + preprocessor = None, \ + stop_words = None, \ + max_features = 100) + train_features = vectorizer.fit_transform( / + weibo_train_data[(weibo_train_data['time']<=end) / + & (fw.weibo_train_data['time']>=start)].context_clean) + train_features = train_features.toarray() + train_labels = weibo_train_data[(weibo_train_data['time']<=end) / + & (fw.weibo_train_data['time']>=start)][label] + + model = linear_model.LinearRegression() + model.fit(train_features,train_labels) + print '====='+feature_type+'_'+model_type+'=====' + # The coefficients + print 'Coefficients: \n', model.coef_ + # The mean square error + print "Residual sum of squares: %.2f" % / + np.mean((model.predict(train_features) - train_labels) ** 2) + # Explained variance score: 1 is perfect prediction + print 'Variance score: %.2f' % model.score(train_features, train_labels) + + joblib.dump(model,feature_type+'_'+model_type+'_'+start+'_'+end+'.model') + return model + + + + + + if __name__ == "__main__": loadData() \ No newline at end of file