Fixed a bug when processing data without score columns. #2

Open · wants to merge 2 commits into base: master
6 changes: 6 additions & 0 deletions blackbox/getsampledata.sh
@@ -0,0 +1,6 @@
#!/bin/bash
wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
#apt-get install unrar-free
unrar x MQ2008.rar
rm MQ2008.rar
mv -f MQ2008/Fold1/*.txt .
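
For context, the extracted MQ2008 *.txt files are in the LETOR/SVMLight format that sample.ipynb parses: a relevance label, a qid token, one-indexed feature:value pairs, and a trailing # comment. A row looks roughly like this (illustrative values):

0 qid:10002 1:0.007477 2:0.000000 3:1.000000 46:0.007042 #docid = GX008-86-4444840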
6 changes: 3 additions & 3 deletions blackbox/listnet.py
@@ -65,10 +65,9 @@ def prepare_data(self,filename=None,T=500, dim = 30, train = True, noscore = True):
        self.test_X = []
        for t in data.session.unique():
            dt = data[data.session == t]
-            if dt.score.sum() == 0:
-                continue
            xt = dt.loc[:,np.in1d(dt.columns,["session"],invert=True)]
            self.test_X.append(xt.values.astype(np.float32))
        self.T = len(self.X)
    else:
        self.test_X = []
        self.test_Y = []
@@ -171,6 +170,7 @@ def predict(self,test_X):
    def test(self,filename,noscore = True):
        testres = 0
        self.prepare_data(filename = filename, train = False, noscore = noscore)
+
        if noscore:
            pred = pd.DataFrame()
            for j in range(self.test_T):
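
For context, the removed guard is what failed on score-less files: pandas attribute access on a missing column raises AttributeError. A minimal sketch with hypothetical data (not from the repo):

import pandas as pd

# mimic the noscore test data: a "session" column but no "score" column
dt = pd.DataFrame({"session": [1, 1], "f0": [0.1, 0.2]})
try:
    dt.score.sum()  # the expression the old guard evaluated
except AttributeError as err:
    print(err)  # 'DataFrame' object has no attribute 'score'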
263 changes: 263 additions & 0 deletions blackbox/sample.ipynb
@@ -0,0 +1,263 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import sklearn.externals.six\n",
"from sklearn.externals.six.moves import range"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def iter_lines(lines, has_targets=True, one_indexed=True, missing=0.0):\n",
" \"\"\"Transforms an iterator of lines to an iterator of LETOR rows.\n",
" Each row is represented by a (x, y, qid, comment) tuple.\n",
" Parameters\n",
" ----------\n",
" lines : iterable of lines\n",
" Lines to parse.\n",
" has_targets : bool, optional\n",
" Whether the file contains targets. If True, will expect the first token\n",
" of every line to be a real representing the sample's target (i.e.\n",
" score). If False, will use -1 as a placeholder for all targets.\n",
" one_indexed : bool, optional\n",
" Whether feature ids are one-indexed. If True, will subtract 1 from each\n",
" feature id.\n",
" missing : float, optional\n",
" Placeholder to use if a feature value is not provided for a sample.\n",
" Yields\n",
" ------\n",
" x : array of floats\n",
" Feature vector of the sample.\n",
" y : float\n",
" Target value (score) of the sample, or -1 if no target was parsed.\n",
" qid : object\n",
" Query id of the sample. This is currently guaranteed to be a string.\n",
" comment : str\n",
" Comment accompanying the sample.\n",
" \"\"\"\n",
" for line in lines:\n",
" data, _, comment = line.rstrip().partition('#')\n",
" toks = data.split()\n",
"\n",
" num_features = 0\n",
" x = np.repeat(missing, 8)\n",
" y = -1.0\n",
" if has_targets:\n",
" y = float(toks[0])\n",
" toks = toks[1:]\n",
"\n",
" qid = _parse_qid_tok(toks[0])\n",
"\n",
" for tok in toks[1:]:\n",
" fid, _, val = tok.partition(':')\n",
" fid = int(fid)\n",
" val = float(val)\n",
" if one_indexed:\n",
" fid -= 1\n",
" assert fid >= 0\n",
" while len(x) <= fid:\n",
" orig = len(x)\n",
" x.resize(len(x) * 2)\n",
" x[orig:orig * 2] = missing\n",
"\n",
" x[fid] = val\n",
" num_features = max(fid + 1, num_features)\n",
"\n",
" assert num_features > 0\n",
" x.resize(num_features)\n",
"\n",
" yield (x, y, qid, comment)\n",
"\n",
"\n",
"def read_dataset(source, has_targets=True, one_indexed=True, missing=0.0):\n",
" \"\"\"Parses a LETOR dataset from `source`.\n",
" Parameters\n",
" ----------\n",
" source : string or iterable of lines\n",
" String, file, or other file-like object to parse.\n",
" has_targets : bool, optional\n",
" See `iter_lines`.\n",
" one_indexed : bool, optional\n",
" See `iter_lines`.\n",
" missing : float, optional\n",
" See `iter_lines`.\n",
" Returns\n",
" -------\n",
" X : array of arrays of floats\n",
" Feature matrix (see `iter_lines`).\n",
" y : array of floats\n",
" Target vector (see `iter_lines`).\n",
" qids : array of objects\n",
" Query id vector (see `iter_lines`).\n",
" comments : array of strs\n",
" Comment vector (see `iter_lines`).\n",
" \"\"\"\n",
" if isinstance(source, sklearn.externals.six.string_types):\n",
" source = source.splitlines()\n",
"\n",
" max_width = 0\n",
" xs, ys, qids, comments = [], [], [], []\n",
" it = iter_lines(source, has_targets=has_targets,\n",
" one_indexed=one_indexed, missing=missing)\n",
" for x, y, qid, comment in it:\n",
" xs.append(x)\n",
" ys.append(y)\n",
" qids.append(qid)\n",
" comments.append(comment)\n",
" max_width = max(max_width, len(x))\n",
"\n",
" assert max_width > 0\n",
" X = np.ndarray((len(xs), max_width), dtype=np.float64)\n",
" X.fill(missing)\n",
" for i, x in enumerate(xs):\n",
" X[i, :len(x)] = x\n",
" ys = np.array(ys) if has_targets else None\n",
" qids = np.array(qids)\n",
" comments = np.array(comments)\n",
"\n",
" return (X, ys, qids, comments)\n",
"\n",
"\n",
"def _parse_qid_tok(tok):\n",
" assert tok.startswith('qid:')\n",
" return tok[4:]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with open('train.txt') as trainfile, \\\n",
" open('vali.txt') as valifile, \\\n",
" open('test.txt') as evalfile:\n",
" TX, Ty, Tqids, _ = read_dataset(trainfile)\n",
" VX, Vy, Vqids, _ =read_dataset(valifile)\n",
" EX, Ey, Eqids, _ = read_dataset(evalfile)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"traindata =pd.concat([pd.DataFrame(TX) ,pd.DataFrame(Ty,columns=[\"score\"]),pd.DataFrame(Tqids,columns=[\"session\"])],axis=1)\n",
"validata =pd.concat([pd.DataFrame( VX) ,pd.DataFrame(Vy,columns=[\"score\"]),pd.DataFrame(Vqids,columns=[\"session\"])],axis=1)\n",
"testdata =pd.concat([pd.DataFrame(EX) ,pd.DataFrame(Eqids,columns=[\"session\"])],axis=1)\n",
"traindata.to_csv(\"datasets.csv\",index=None)\n",
"validata.to_csv(\"datasets_test.csv\",index=None)\n",
"testdata.to_csv(\"datasets_test_noscore.csv\",index=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"step:9,train_loss:7.46409893036\n",
"train_ndcg:0.812323686008\n",
"step:19,train_loss:6.07994031906\n",
"train_ndcg:0.909060459757\n",
"step:29,train_loss:7.78203868866\n",
"train_ndcg:0.944716231404\n",
"step:39,train_loss:6.72615432739\n",
"train_ndcg:0.966039979587\n",
"step:49,train_loss:7.4245595932\n",
"train_ndcg:0.977167381069\n",
"step:59,train_loss:9.81043434143\n",
"train_ndcg:0.985105804125\n",
"step:69,train_loss:8.38823699951\n",
"train_ndcg:0.989879857973\n"
]
}
],
"source": [
"!python2.7 listnet.py --train_val_filename \"datasets.csv\" --test_score_filename \"datasets_test.csv\" --test_noscore_filename \"datasets_test_noscore.csv\" --val_ratio 0.0 --max_iter 100\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [py35]",
"language": "python",
"name": "Python [py35]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
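
As a usage sketch, assuming the read_dataset defined in sample.ipynb is in scope, a plain string parses directly because string input is split into lines:

sample = (
    "2 qid:1 1:0.5 2:0.25 # doc-a\n"
    "0 qid:1 1:0.1 2:0.75 # doc-b\n"
)
X, y, qids, comments = read_dataset(sample)
print(X.shape)  # (2, 2): two rows, highest one-indexed feature id is 2
print(y)        # [2. 0.]
print(qids)     # ['1' '1']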