From f98775bd209fdb9b54b6ca914ace2ae9a20fd38f Mon Sep 17 00:00:00 2001
From: mitsuo yamamoto
Date: Tue, 22 Nov 2016 15:59:13 +0900
Subject: [PATCH 1/2] Fixed a bug when processing data without score columns.

---
 blackbox/listnet.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/blackbox/listnet.py b/blackbox/listnet.py
index 57f1e5d..856406e 100644
--- a/blackbox/listnet.py
+++ b/blackbox/listnet.py
@@ -65,10 +65,9 @@ def prepare_data(self,filename=None,T=500, dim = 30, train = True, noscore = Tru
             self.test_X = []
             for t in data.session.unique():
                 dt = data[data.session == t]
-                if dt.score.sum() == 0:
-                    continue
                 xt = dt.loc[:,np.in1d(dt.columns,["session"],invert=True)]
-                self.test_X.append(xt.values.astype(np.float32))
+                self.test_X.append(xt.values.astype(np.float32))
+            self.T = len(self.X)
         else:
             self.test_X = []
             self.test_Y = []
@@ -171,6 +170,7 @@ def predict(self,test_X):
     def test(self,filename,noscore = True):
         testres = 0
         self.prepare_data(filename = filename, train = False, noscore = noscore)
+
         if noscore:
             pred = pd.DataFrame()
             for j in range(self.test_T):
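
For context on the fix above: a no-score test file has no score column at all, so
the old guard dt.score.sum() raised AttributeError (pandas attribute access on a
missing column) before any session could reach self.test_X. A minimal sketch of
the failing and the retained path, on made-up data rather than anything from the
repository:

    import numpy as np
    import pandas as pd

    # Hypothetical no-score input: session ids plus one feature, no score column.
    data = pd.DataFrame({"session": ["q1", "q1", "q2"],
                         "f0": [0.1, 0.2, 0.3]})

    for t in data.session.unique():
        dt = data[data.session == t]
        try:
            if dt.score.sum() == 0:  # removed guard: there is no score to sum
                continue
        except AttributeError as e:
            print("old guard fails:", e)
        # retained path: keep every session's feature block
        xt = dt.loc[:, np.in1d(dt.columns, ["session"], invert=True)]
        print(t, xt.values.astype(np.float32).shape)
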
From a4a3cbf9c16c1a617f47263498b2453c24c91f3e Mon Sep 17 00:00:00 2001
From: mitsuo yamamoto
Date: Tue, 22 Nov 2016 16:26:13 +0900
Subject: [PATCH 2/2] Add example.

---
 blackbox/getsampledata.sh |   6 +
 blackbox/sample.ipynb     | 263 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 269 insertions(+)
 create mode 100644 blackbox/getsampledata.sh
 create mode 100644 blackbox/sample.ipynb

diff --git a/blackbox/getsampledata.sh b/blackbox/getsampledata.sh
new file mode 100644
index 0000000..888b7d5
--- /dev/null
+++ b/blackbox/getsampledata.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
+#apt-get install unrar-free
+unrar MQ2008.rar
+rm MQ2008.rar
+mv -f MQ2008/Fold1/*.txt .
diff --git a/blackbox/sample.ipynb b/blackbox/sample.ipynb
new file mode 100644
index 0000000..0acbfbf
--- /dev/null
+++ b/blackbox/sample.ipynb
@@ -0,0 +1,263 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import sklearn.externals.six\n",
+    "from sklearn.externals.six.moves import range"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def iter_lines(lines, has_targets=True, one_indexed=True, missing=0.0):\n",
+    "    \"\"\"Transforms an iterator of lines to an iterator of LETOR rows.\n",
+    "    Each row is represented by a (x, y, qid, comment) tuple.\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    lines : iterable of lines\n",
+    "        Lines to parse.\n",
+    "    has_targets : bool, optional\n",
+    "        Whether the file contains targets. If True, will expect the first token\n",
+    "        of every line to be a real representing the sample's target (i.e.\n",
+    "        score). If False, will use -1 as a placeholder for all targets.\n",
+    "    one_indexed : bool, optional\n",
+    "        Whether feature ids are one-indexed. If True, will subtract 1 from each\n",
+    "        feature id.\n",
+    "    missing : float, optional\n",
+    "        Placeholder to use if a feature value is not provided for a sample.\n",
+    "    Yields\n",
+    "    ------\n",
+    "    x : array of floats\n",
+    "        Feature vector of the sample.\n",
+    "    y : float\n",
+    "        Target value (score) of the sample, or -1 if no target was parsed.\n",
+    "    qid : object\n",
+    "        Query id of the sample. This is currently guaranteed to be a string.\n",
+    "    comment : str\n",
+    "        Comment accompanying the sample.\n",
+    "    \"\"\"\n",
+    "    for line in lines:\n",
+    "        data, _, comment = line.rstrip().partition('#')\n",
+    "        toks = data.split()\n",
+    "\n",
+    "        num_features = 0\n",
+    "        x = np.repeat(missing, 8)\n",
+    "        y = -1.0\n",
+    "        if has_targets:\n",
+    "            y = float(toks[0])\n",
+    "            toks = toks[1:]\n",
+    "\n",
+    "        qid = _parse_qid_tok(toks[0])\n",
+    "\n",
+    "        for tok in toks[1:]:\n",
+    "            fid, _, val = tok.partition(':')\n",
+    "            fid = int(fid)\n",
+    "            val = float(val)\n",
+    "            if one_indexed:\n",
+    "                fid -= 1\n",
+    "            assert fid >= 0\n",
+    "            while len(x) <= fid:\n",
+    "                orig = len(x)\n",
+    "                x.resize(len(x) * 2)\n",
+    "                x[orig:orig * 2] = missing\n",
+    "\n",
+    "            x[fid] = val\n",
+    "            num_features = max(fid + 1, num_features)\n",
+    "\n",
+    "        assert num_features > 0\n",
+    "        x.resize(num_features)\n",
+    "\n",
+    "        yield (x, y, qid, comment)\n",
+    "\n",
+    "\n",
+    "def read_dataset(source, has_targets=True, one_indexed=True, missing=0.0):\n",
+    "    \"\"\"Parses a LETOR dataset from `source`.\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    source : string or iterable of lines\n",
+    "        String, file, or other file-like object to parse.\n",
+    "    has_targets : bool, optional\n",
+    "        See `iter_lines`.\n",
+    "    one_indexed : bool, optional\n",
+    "        See `iter_lines`.\n",
+    "    missing : float, optional\n",
+    "        See `iter_lines`.\n",
+    "    Returns\n",
+    "    -------\n",
+    "    X : array of arrays of floats\n",
+    "        Feature matrix (see `iter_lines`).\n",
+    "    y : array of floats\n",
+    "        Target vector (see `iter_lines`).\n",
+    "    qids : array of objects\n",
+    "        Query id vector (see `iter_lines`).\n",
+    "    comments : array of strs\n",
+    "        Comment vector (see `iter_lines`).\n",
+    "    \"\"\"\n",
+    "    if isinstance(source, sklearn.externals.six.string_types):\n",
+    "        source = source.splitlines()\n",
+    "\n",
+    "    max_width = 0\n",
+    "    xs, ys, qids, comments = [], [], [], []\n",
+    "    it = iter_lines(source, has_targets=has_targets,\n",
+    "                    one_indexed=one_indexed, missing=missing)\n",
+    "    for x, y, qid, comment in it:\n",
+    "        xs.append(x)\n",
+    "        ys.append(y)\n",
+    "        qids.append(qid)\n",
+    "        comments.append(comment)\n",
+    "        max_width = max(max_width, len(x))\n",
+    "\n",
+    "    assert max_width > 0\n",
+    "    X = np.ndarray((len(xs), max_width), dtype=np.float64)\n",
+    "    X.fill(missing)\n",
+    "    for i, x in enumerate(xs):\n",
+    "        X[i, :len(x)] = x\n",
+    "    ys = np.array(ys) if has_targets else None\n",
+    "    qids = np.array(qids)\n",
+    "    comments = np.array(comments)\n",
+    "\n",
+    "    return (X, ys, qids, comments)\n",
+    "\n",
+    "\n",
+    "def _parse_qid_tok(tok):\n",
+    "    assert tok.startswith('qid:')\n",
+    "    return tok[4:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "with open('train.txt') as trainfile, \\\n",
+    "     open('vali.txt') as valifile, \\\n",
+    "     open('test.txt') as evalfile:\n",
+    "    TX, Ty, Tqids, _ = read_dataset(trainfile)\n",
+    "    VX, Vy, Vqids, _ = read_dataset(valifile)\n",
+    "    EX, Ey, Eqids, _ = read_dataset(evalfile)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "traindata = pd.concat([pd.DataFrame(TX), pd.DataFrame(Ty, columns=[\"score\"]), pd.DataFrame(Tqids, columns=[\"session\"])], axis=1)\n",
+    "validata = pd.concat([pd.DataFrame(VX), pd.DataFrame(Vy, columns=[\"score\"]), pd.DataFrame(Vqids, columns=[\"session\"])], axis=1)\n",
+    "testdata = pd.concat([pd.DataFrame(EX), pd.DataFrame(Eqids, columns=[\"session\"])], axis=1)\n",
+    "traindata.to_csv(\"datasets.csv\", index=None)\n",
+    "validata.to_csv(\"datasets_test.csv\", index=None)\n",
+    "testdata.to_csv(\"datasets_test_noscore.csv\", index=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "step:9,train_loss:7.46409893036\n",
+      "train_ndcg:0.812323686008\n",
+      "step:19,train_loss:6.07994031906\n",
+      "train_ndcg:0.909060459757\n",
+      "step:29,train_loss:7.78203868866\n",
+      "train_ndcg:0.944716231404\n",
+      "step:39,train_loss:6.72615432739\n",
+      "train_ndcg:0.966039979587\n",
+      "step:49,train_loss:7.4245595932\n",
+      "train_ndcg:0.977167381069\n",
+      "step:59,train_loss:9.81043434143\n",
+      "train_ndcg:0.985105804125\n",
+      "step:69,train_loss:8.38823699951\n",
+      "train_ndcg:0.989879857973\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python2.7 listnet.py --train_val_filename \"datasets.csv\" --test_score_filename \"datasets_test.csv\" --test_noscore_filename \"datasets_test_noscore.csv\" --val_ratio 0.0 --max_iter 100\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python [py35]",
+   "language": "python",
+   "name": "Python [py35]"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
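
To try the series end-to-end: getsampledata.sh downloads MQ2008 and drops Fold1's
train.txt, vali.txt and test.txt into blackbox/, the notebook converts them into
datasets.csv, datasets_test.csv and datasets_test_noscore.csv (the last one
deliberately has no score column, which is exactly the input the first patch
fixes), and the final cell runs listnet.py with the flags shown. As a quick
sanity check of the parser, here is a sketch over a hypothetical two-line LETOR
fragment (not taken from MQ2008), assuming the notebook cell defining
read_dataset has been executed:

    # LETOR rows: target first, then qid:<id>, then one-indexed feature:value
    # pairs; everything after '#' is a comment.
    sample = ("2 qid:10 1:0.5 2:0.3 # doc-a\n"
              "0 qid:10 1:0.1 2:0.9 # doc-b")
    X, y, qids, comments = read_dataset(sample)
    print(X.shape)   # (2, 2): two documents, two features
    print(y)         # targets: [2., 0.]
    print(qids)      # ['10', '10'] -- qids stay strings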