From bd15d5ea746ce14ee0e24304868c23aca7ec9b5a Mon Sep 17 00:00:00 2001
From: themistoklik
Date: Tue, 15 Mar 2016 01:42:08 +0200
Subject: [PATCH] added nbviewer,fixed link

---
 .../Visual data exploration-checkpoint.ipynb | 249 ++++++++++++++++++
 .../source/Visual data exploration.ipynb     |  21 +-
 2 files changed, 259 insertions(+), 11 deletions(-)
 create mode 100644 Homework2/source/.ipynb_checkpoints/Visual data exploration-checkpoint.ipynb

diff --git a/Homework2/source/.ipynb_checkpoints/Visual data exploration-checkpoint.ipynb b/Homework2/source/.ipynb_checkpoints/Visual data exploration-checkpoint.ipynb
new file mode 100644
index 0000000..7512257
--- /dev/null
+++ b/Homework2/source/.ipynb_checkpoints/Visual data exploration-checkpoint.ipynb
@@ -0,0 +1,249 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Interactive visualization of topics in our dataset\n",
+ "\n",
+ "Below is an IPython notebook presenting the results of applying LDA to our already preprocessed dataset. This uses the raw BoW representation; tf-idf could be added in the future (a sketch appears after the LDA cell below).\n",
+ "\n",
+ "First we load the data and drop unneeded columns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# -*- coding: utf-8 -*-\n",
+ "\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import gensim.models.ldamulticore as lda\n",
+ "from gensim import matutils as mu\n",
+ "import pyLDAvis as pld\n",
+ "import pyLDAvis.gensim as gensimvis\n",
+ "import gensim.corpora.dictionary as gensimdict\n",
+ "\n",
+ "# load the term-frequency matrix; rows are documents, columns are terms\n",
+ "filename = 'data/dataset.txt'\n",
+ "X = pd.read_csv(filename, sep=';')\n",
+ "X = X.drop('category', axis=1)\n",
+ "X = X.drop('project', axis=1)\n",
+ "vocab = list(X)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Additional preprocessing\n",
+ "\n",
+ "We don't need terms that appear too often or too rarely in our dataset, as they tend to either dominate the topics or contribute only noise."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "# drop terms that are too rare (<= 20 occurrences) or too frequent (>= 500)\n",
+ "droplist = []\n",
+ "for item in vocab:\n",
+ "    total = X[item].sum()\n",
+ "    if total <= 20 or total >= 500:\n",
+ "        droplist.append(item)\n",
+ "X.drop(droplist, axis=1, inplace=True, errors='ignore')\n",
+ "\n",
+ "vocab = list(X)"
+ ]
+ },
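+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As an aside, the same filtering can be written without an explicit loop. The cell below is a minimal vectorized sketch, not part of the original pipeline; it assumes X is the term-frequency DataFrame loaded above and keeps the same 20/500 cut-offs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# sketch: vectorized equivalent of the filtering loop above\n",
+ "totals = X.sum(axis=0)  # total occurrences of each term across documents\n",
+ "X = X[totals[(totals > 20) & (totals < 500)].index]  # keep mid-frequency terms\n",
+ "vocab = list(X)"
+ ]
+ },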
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## LDA\n",
+ "\n",
+ "We use gensim's LDA module in its parallel flavor. id2word_dict is a dictionary that maps numeric IDs to words, and is needed by LDA to produce readable output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(0,\n",
+ "  '0.004*jakewharton + 0.004*rxbinding + 0.004*assertBothWays + 0.004*acceptance + 0.003*Observable + 0.003*jsonpath + 0.003*jayway + 0.003*JUnitCore + 0.003*HierarchicalStreamReader + 0.003*addOption'),\n",
+ " (1,\n",
+ "  '0.004*peer + 0.004*Buffer + 0.003*LOGGER + 0.003*CSVFormat + 0.003*AbstractApplication + 0.003*XMLUnit + 0.003*readAscii + 0.003*XMLStreamReader + 0.003*ComparisonResult + 0.002*TLS'),\n",
+ " (2,\n",
+ "  '0.003*PropertyChangeListener + 0.003*JButton + 0.003*JPanel + 0.003*SwingUtilities + 0.003*firePropertyChange + 0.003*JXTree + 0.003*defaults + 0.003*Insets + 0.003*AbstractAction + 0.003*JFrame'),\n",
+ " (3,\n",
+ "  '0.003*AccessToken + 0.003*LatLng + 0.003*putString + 0.003*gms + 0.003*amazon + 0.003*Utility + 0.003*EXTRA + 0.003*ResponseData + 0.003*BeansManager + 0.002*geo'),\n",
+ " (4,\n",
+ "  '0.006*mvp + 0.004*jcommander + 0.004*beust + 0.004*DocType + 0.003*presenter + 0.003*Parameter + 0.003*Mail + 0.003*Verifier + 0.003*AtomicReference + 0.003*fstack'),\n",
+ " (5,\n",
+ "  '0.004*SuperCsvTestUtils + 0.004*PREFERENCE + 0.004*entypo + 0.004*outline + 0.004*PROCESSORS + 0.004*PREFS + 0.003*CUSTOMERS + 0.003*typcn + 0.003*CsvSchema + 0.003*checkPreconditions'),\n",
+ " (6,\n",
+ "  '0.007*DefaultJSONParser + 0.006*SerializeConfig + 0.006*setId + 0.005*fieldInfo + 0.005*asm + 0.005*argsj + 0.005*DefaultExtJSONParser + 0.005*kohsuke + 0.005*ALOAD + 0.005*var'),\n",
+ " (7,\n",
+ "  '0.004*PRODUCER + 0.004*DataSource + 0.004*mProducerListener + 0.003*mGestureDetector + 0.003*PooledByteBuffer + 0.003*VisibleForTesting + 0.003*Consumer + 0.003*mFadeDrawable + 0.003*drawee + 0.003*FLog'),\n",
+ " (8,\n",
+ "  '0.006*TestNG + 0.005*Handle + 0.005*Something + 0.005*accepts + 0.005*joptsimple + 0.004*tla + 0.004*xsb + 0.004*dbi + 0.004*sqlobject + 0.004*OptionSet'),\n",
+ " (9,\n",
+ "  '0.004*BasicHttpRequest + 0.004*HttpCacheEntry + 0.004*XmlCharTypes + 0.004*lexicalscope + 0.004*outputPtr + 0.003*currToken + 0.003*jewel + 0.003*originResponse + 0.003*BasicHeader + 0.003*SETBITS')]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# dictionary that maps index to word. We'll need it later for LDA\n",
+ "id2word_dict = {k: v for k, v in enumerate(vocab)}\n",
+ "# transpose so documents end up as columns, as Dense2Corpus expects by default\n",
+ "corpus = mu.Dense2Corpus(X.as_matrix().T)\n",
+ "# below is how the model was trained; for now we'll use the saved version of it. Much like a cooking show!\n",
+ "#model = lda.LdaMulticore(corpus, num_topics=10, id2word=id2word_dict, workers=3, iterations=1000, passes=3)\n",
+ "model = lda.LdaMulticore.load('data/lda')\n",
+ "model.show_topics(num_words=10, formatted=True)"
+ ]
+ },
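+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The intro mentions tf-idf as possible future work. The cell below is only a minimal sketch of how the BoW corpus could be reweighted with gensim's TfidfModel before training a second model; it was not used to produce the saved model above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# sketch: tf-idf reweighting of the BoW corpus (not used for the saved model)\n",
+ "from gensim.models import TfidfModel\n",
+ "tfidf = TfidfModel(corpus)\n",
+ "corpus_tfidf = tfidf[corpus]\n",
+ "# an LDA model could then be trained on corpus_tfidf instead of corpus:\n",
+ "#model_tfidf = lda.LdaMulticore(corpus_tfidf, num_topics=10, id2word=id2word_dict, workers=3)"
+ ]
+ },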
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare a better visualization\n",
+ "\n",
+ "We're going to use the pyLDAvis module to produce an interactive plot of our data. The people who built the module were kind enough to provide helpers that smooth the process for models trained with the gensim toolkit. What's needed are the model, the corpus, and a gensim dictionary, which we \"generate\" a posteriori.\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# invert the dict (word -> id) to match pyLDAvis requirements\n",
+ "id2word_dict_rev = {v: k for k, v in id2word_dict.items()}\n",
+ "visdict = gensimdict.Dictionary()\n",
+ "visdict.token2id = id2word_dict_rev\n",
+ "visdata = gensimvis.prepare(model, corpus, visdict)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "GitHub's static rendering does not show the interactive plot, so check http://nbviewer.jupyter.org/github/mentekid/PatternRecognition/blob/master/Homework2/source/Visual%20data%20exploration.ipynb# instead."
+ ]
+ },
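+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The visualization can also be exported as a standalone HTML file via pyLDAvis's save_html helper, which is handy when nbviewer is not an option. The cell below is a minimal sketch; the output filename is arbitrary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# sketch: write the interactive visualization to a self-contained HTML file\n",
+ "pld.save_html(visdata, 'lda_vis.html')"
+ ]
+ },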
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#voila \n",
+ "pld.display(visdata)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

diff --git a/Homework2/source/Visual data exploration.ipynb b/Homework2/source/Visual data exploration.ipynb
index c9f56af..7512257 100644
--- a/Homework2/source/Visual data exploration.ipynb
+++ b/Homework2/source/Visual data exploration.ipynb
@@ -150,11 +150,19 @@
 "visdata = gensimvis.prepare(model, corpus, visdict)\n"
 ]
 },
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "GitHub's static rendering does not show the interactive plot, so check http://nbviewer.jupyter.org/github/mentekid/PatternRecognition/blob/master/Homework2/source/Visual%20data%20exploration.ipynb# instead."
+ ]
+},
 {
 "cell_type": "code",
 "execution_count": 9,
 "metadata": {
- "collapsed": false
+ "collapsed": false,
+ "scrolled": true
 },
 "outputs": [
 {
 }
 ],
 "source": [
- "#voila check http://nbviewer.jupyter.org/github/mentekid/PatternRecognition/blob/master/Homework2/source/Visual%20data%20exploration.ipynb# \n",
+ "#voila \n",
 "pld.display(visdata)"
 ]
-},
-{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
 }
 ],
 "metadata": {