-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata.py
92 lines (70 loc) · 2.88 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from itertools import izip, imap, chain
from collections import defaultdict
import operator
import numpy as np
import io
from sklearn.externals import joblib
import os
DATA_ROOT = os.getenv("HOME") + '/data/'
JUDGEMENT_FILE = DATA_ROOT + 'all_judgements.txt'
FULLTEXT_FOLDER = DATA_ROOT + 'url-header-html-txt'
GROUND_TRUTH_FILE = DATA_ROOT + 'task1_unlabeled_g_truth_cons_public.csv'
class JudgementRecord(object):
def __init__(self, table_row):
attributes = table_row.split('\t')
team_id, worker_id, _, topic_id, doc_id, _, relevance, _, _, _, label_type = attributes
self.team_id = team_id
self.worker_id = worker_id
self.label_type = int(label_type)
self.topic_id = topic_id
self.doc_id = doc_id
if not relevance=='na':
self.is_relevant = (float(relevance) >= 0.5)
else:
self.is_relevant = None
# First read off ground truth so we can look it up afterwards
truth_by_topic_id_and_doc_id = {}
with io.open(GROUND_TRUTH_FILE, 'r', encoding='utf-8') as f:
headers = f.readline()
for line in f:
cols = line[:-1].split("\t")
topic_id = cols[1]
doc_id = cols[2]
truth = float(cols[3]) / 2
if truth < 0:
truth = None
else:
truth = bool(truth)
truth_by_topic_id_and_doc_id[(topic_id, doc_id)] = truth
with io.open(JUDGEMENT_FILE, 'r', encoding='utf-8') as f:
judgement_records = [JudgementRecord(line[:-1]) for line in f]
is_useful = lambda j: (j.label_type == 0) and (j.is_relevant is not None)
judgements = filter(is_useful, judgement_records)
# Grouping judgements by topic
judgements_by_topic_id = defaultdict(list)
for judgement in judgements:
judgements_by_topic_id[judgement.topic_id].append(judgement)
texts_vote_lists_truths_by_topic_id = {}
for topic_id, judgements in judgements_by_topic_id.iteritems():
# Grouping judgements by document inside a topic
judgements_by_doc_id = defaultdict(list)
for judgement in judgements:
judgements_by_doc_id[judgement.doc_id].append(judgement)
# We now have a particular indexing of documents: judgements_by_doc_id.keys()
# Get full texts of documents
document_texts = []
for doc_id in judgements_by_doc_id.keys():
# this doc is in <topic_id>/<doc_id>.txt file
filename = "%s/%s/%s.txt" % (FULLTEXT_FOLDER, topic_id, doc_id)
with io.open(filename, 'r', encoding='utf-8') as f:
document_texts.append(f.read())
vote_lists = [ [j.is_relevant for j in judgements_by_doc_id[doc_id]]
for doc_id in judgements_by_doc_id.keys() ]
# Gettings ground truth for all the documents
truths = [truth_by_topic_id_and_doc_id[(topic_id, doc_id)] \
for doc_id in judgements_by_doc_id.keys()]
texts_vote_lists_truths_by_topic_id[topic_id] = (document_texts, vote_lists, truths)
if __name__ == "__main__":
# Print a list of topic ids
topic_id_list = '(' + ' '.join(texts_vote_lists_truths_by_topic_id.keys()) + ')'
print topic_id_list