corpus.py
from typing import Dict

from bs4 import BeautifulSoup
from nltk.corpus import wordnet as wn

RAW_DATA = "SemEval-2013-Task-13-test-data/contexts/senseval2-format/semeval-2013-task-13-test-data.senseval2.xml"

# Map SemEval POS tags to WordNet POS constants
pos_to_wnpos = {
    'v': wn.VERB,
    'n': wn.NOUN,
    'j': wn.ADJ
}


class Lexelt:
    """A lexical item (lemma + POS) together with its annotated instances."""

    def __init__(self, lemma_pos: str):
        self.lemma, pos = lemma_pos.split('.')
        self.pos = pos_to_wnpos[pos]
        self.instances = []
        self.max_sentence_len = 0

    def addInstance(self, token: str, num: str, context: str):
        self.instances.append({"id": num, "token": token, "context": context})
        self.max_sentence_len = max(len(context), self.max_sentence_len)


def loadSenseval2Format(filename: str = RAW_DATA) -> Dict[str, Lexelt]:
    """Parse the SensEval-2-format XML file into a dict keyed by lexelt item."""
    Dataset = {}
    with open(filename, 'r') as text_raw:
        raw_xml = text_raw.read()
    xml = BeautifulSoup(raw_xml, features="html.parser")
    for lexelt in xml.corpus.find_all("lexelt"):
        item = lexelt['item']
        currentLexelt = Lexelt(item)
        for instance in lexelt.find_all("instance"):
            num = instance["id"].split('.')[-1]
            token = instance.head.text
            context = instance.context.text
            currentLexelt.addInstance(token, num, context)
        Dataset[item] = currentLexelt
    return Dataset


def main():
    # Quick sanity check of the parser
    Dataset = loadSenseval2Format()
    print(Dataset["become.v"].lemma)
    print(Dataset["become.v"].pos)
    print(Dataset["become.v"].instances[0])
    print(Dataset["become.v"].max_sentence_len)


if __name__ == "__main__":
    main()
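

# --- Illustrative usage sketch (not part of the original file) ---
# A minimal example of how the parsed Dataset could feed a sense-inventory
# lookup: fetch the candidate WordNet synsets for each lexelt's lemma/POS.
# The helper name `candidate_senses` is hypothetical and introduced here
# only for illustration; it is not defined elsewhere in this repository.
def candidate_senses(lexelt: Lexelt):
    # wn.synsets filters by lemma string and WordNet POS constant
    return wn.synsets(lexelt.lemma, pos=lexelt.pos)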