-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataloader.py
119 lines (97 loc) · 3.84 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import argparse
if __name__ == '__main__':
    # Parse the single positional CLI argument that selects the corpus.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "scenario",
        help="Possible arguments: s_Croatian, ns_Croatian, s_Serbian, ns_Serbian",
    )
    cli_args = arg_parser.parse_args()
    # The chosen scenario key, consumed by extract_ner_dataset below.
    scenario = cli_args.scenario
def extract_ner_dataset(scenario):
    """
    Extract a NER dataset that can be used for NER evaluation with
    simple transformers.

    Reads the CoNLL-U-Plus file(s) for the given scenario from the
    local ``datasets/`` directory, collects per-token words, NER tags
    and train/dev/test split membership, prints summary statistics,
    and writes ``datasets/<dataset>_extracted.json`` with keys
    ``labels``, ``train``, ``dev`` and ``test``.

    Args:
        scenario: one of s_Croatian, ns_Croatian, s_Serbian, ns_Serbian.

    Raises:
        KeyError: if ``scenario`` is not one of the known keys.
        FileNotFoundError: if the corpus file is missing from ``datasets/``.
    """
    # Third-party imports are kept local to the function, matching the
    # original script's style. (Unused `random` import removed.)
    from conllu import parse
    import pandas as pd
    import numpy as np
    import json

    datasets = {
        "s_Croatian": {
            "name": "Croatian linguistic training corpus hr500k 2.0",
            "path": "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1792/hr500k.conllup",
            "dataset": ["hr500k.conllup"]},
        "ns_Croatian": {
            "name": "Croatian Twitter training corpus ReLDI-NormTagNER-hr 3.0",
            "path": "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1793/reldi-normtagner-hr.conllup",
            "dataset": ["reldi-normtagner-hr.conllup"]},
        "s_Serbian": {
            "name": "Serbian linguistic training corpus SETimes.SR 2.0",
            "path": "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1843/set.sr.plus.conllup",
            "dataset": ["set.sr.plus.conllup"]},
        "ns_Serbian": {
            "name": "Serbian Twitter training corpus ReLDI-NormTagNER-sr 3.0",
            "path": "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1794/reldi-normtagner-sr.conllup",
            "dataset": ["reldi-normtagner-sr.conllup"]}
    }

    # Loop through all the datasets if there are multiple datasets for
    # one scenario. Iterate directly instead of indexing by range(len()).
    for dataset in datasets[scenario]["dataset"]:
        doc = "datasets/{}".format(dataset)
        # Context manager guarantees the file handle is closed
        # (the original left it dangling).
        with open(doc, "r") as corpus_file:
            data = corpus_file.read()
        # Parse conllu file
        sentences = parse(data)

        word_list = []
        sent_id_list = []
        NER_list = []
        split_list = []

        # Collect all important information from the dataset.
        # `current_split` is initialised to None so that a sentence
        # missing the "contained_in_datasets" metadata no longer raises
        # UnboundLocalError on the first sentence; a None split marks
        # sentences whose split could not be determined. As in the
        # original, later sentences without the metadata inherit the
        # most recent split seen.
        current_split = None
        for sentence in sentences:
            current_sent_id = sentence.metadata["sent_id"]
            if sentence.metadata.get("contained_in_datasets") is not None:
                current_dataset = sentence.metadata["contained_in_datasets"]
                if "train" in current_dataset:
                    current_split = "train"
                elif "dev" in current_dataset:
                    current_split = "dev"
                elif "test" in current_dataset:
                    current_split = "test"
            for token in sentence:
                word_list.append(token["form"])
                sent_id_list.append(current_sent_id)
                NER_list.append(token["reldi:ne"])
                split_list.append(current_split)

        # Create a dictionary for all words and all needed information
        data_dict = {"sentence_id": sent_id_list, "words": word_list, "labels": NER_list, "split": split_list}
        # Create a pandas df out of the dictionary
        df = pd.DataFrame(data_dict)

        LABELS = list(df.labels.unique())
        # If * is used, change * to O, because this causes errors
        if "*" in LABELS:
            LABELS[LABELS.index("*")] = "O"
            df["labels"] = np.where(df["labels"] == "*", "O", df["labels"])

        # Show the df
        print(df.head())
        print("\n")
        print(df.describe(include="all"))
        print("\n")
        print(df.split.value_counts(normalize=True))
        print("\n")
        print(df.labels.value_counts(normalize=True))
        print("\n")

        # Save the information in a format that will be used by simpletransformers
        json_dict = {
            "labels": LABELS,
            "train": df[df["split"] == "train"].drop(columns="split").to_dict(),
            "dev": df[df["split"] == "dev"].drop(columns="split").to_dict(),
            "test": df[df["split"] == "test"].drop(columns="split").to_dict()
        }

        # Save json as file
        with open("datasets/{}_extracted.json".format(dataset), "w") as end_file:
            json.dump(json_dict, end_file, indent=2)

        print("\n\nExtracted dataset saved as datasets/{}_extracted.json".format(dataset))
if __name__ == '__main__':
    # Run the extraction only when executed as a script. The original
    # call was unguarded, so importing this module raised NameError:
    # `scenario` is defined only inside the script's __main__ guard.
    extract_ner_dataset(scenario)