Upload archive of deliverables

clarinsi · Nov 17, 2024 · 2e2f947 · 2e2f947
1 parent a037113
commit 2e2f947
Show file tree

Hide file tree

Showing 76 changed files with 6,708 additions and 0 deletions.
diff --git a/Benchmarking_SloBENCH/eval_coref149/Dockerfile b/Benchmarking_SloBENCH/eval_coref149/Dockerfile
@@ -0,0 +1,12 @@
+# Image that runs the coref149 evaluation.
+FROM python:3.8.11-bullseye
+
+# Scorer package, evaluation scripts and dependency list. The source paths are
+# relative to the build context, which the README's build command sets to the
+# repository root.
+COPY evaluation_scripts/eval_coref149/scorer ./scorer
+COPY evaluation_scripts/eval_coref149/ua-scorer.py .
+COPY evaluation_scripts/eval_coref149/evaluate_corefud.py .
+COPY evaluation_scripts/eval_coref149/evaluate.py .
+COPY evaluation_scripts/eval_coref149/requirements.txt .
+# Install the Python dependencies without keeping pip's download cache.
+RUN pip install --no-cache-dir -r requirements.txt
+
+# run.py is taken from the root of the build context, not from the
+# eval_coref149 directory.
+COPY run.py .
+
+# Exec-form entrypoint: arguments given to `docker run` after the image name
+# are appended to this command.
+ENTRYPOINT ["python3", "run.py"]
diff --git a/Benchmarking_SloBENCH/eval_coref149/LICENSE_SCORER b/Benchmarking_SloBENCH/eval_coref149/LICENSE_SCORER
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright 2018 Nafise Sadat Moosavi (ns.moosavi at gmail dot com)
+Copyright 2021 Juntao Yu (juntao.cn at gmail dot com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Benchmarking_SloBENCH/eval_coref149/README.md b/Benchmarking_SloBENCH/eval_coref149/README.md
@@ -0,0 +1,18 @@
+# eval_coref149 slobench evaluation script
+
+All commands should be run from the root directory of the repository.
+
+## Build docker image (from the root directory of this repo):
+
+```
+docker buildx build --platform linux/amd64 -t eval:eval_coref149 -f evaluation_scripts/eval_coref149/Dockerfile .
+```
+
+## Run mock evaluation (from the root directory of this repo)
+
+```
+docker run -it --name eval-container_coref149 --rm \
+-v $PWD/evaluation_scripts/eval_coref149/sample_ground_truth.zip:/ground_truth.zip \
+-v $PWD/evaluation_scripts/eval_coref149/sample_submission.zip:/submission.zip \
+eval:eval_coref149 ground_truth.zip submission.zip
+```
diff --git a/Benchmarking_SloBENCH/eval_coref149/evaluate.py b/Benchmarking_SloBENCH/eval_coref149/evaluate.py
@@ -0,0 +1,13 @@
+import os
+
+from evaluate_corefud import call_scorer
+
+
+def evaluate(data_ground_truth_path, data_submission_path):
+	try:
+		# The script is just glue code around ufal/corefud-scorer
+		metrics = call_scorer(os.path.join(".", data_ground_truth_path, "coref149.conllu"),
+							  os.path.join(".", data_submission_path, "submission.conllu"))
+		return metrics
+	except Exception as e:
+		raise Exception(f'Exception in metric calculation: {e}')
diff --git a/Benchmarking_SloBENCH/eval_coref149/evaluate_corefud.py b/Benchmarking_SloBENCH/eval_coref149/evaluate_corefud.py
@@ -0,0 +1,53 @@
+import importlib
+
+from scorer.corefud.reader import CorefUDReader
+from scorer.eval import evaluator
+uascorer = importlib.import_module("ua-scorer")
+
+
+def call_scorer(ref_file, pred_file):
+	args = {
+		"key_file": ref_file,
+		"sys_file": pred_file,
+		"metrics": ['muc', 'bcub', 'ceafe', 'ceafm', 'blanc', 'lea', 'mor'],
+		"keep_singletons": False,
+		"match": "head",
+		"zero_match_method": "dependent",
+		"format": "corefud",
+		"keep_split_antecedents": False,
+		"keep_zeros": True,
+		"evaluate_discourse_deixis": False,
+		"only_split_antecedent": False,
+		"allow_boundary_crossing": False,
+		"np_only": False,
+		"remove_nested_mentions": False,
+		"shared_task": None
+	}
+	uascorer.process_arguments(args)
+	reader = CorefUDReader(**args)
+	reader.get_coref_infos(args["key_file"], args["sys_file"])
+
+	conll = 0
+	conll_subparts_num = 0
+
+	calculated_metrics = {}
+	for name, metric in args["metrics"]:
+		recall, precision, f1 = evaluator.evaluate_documents(
+			reader.doc_discourse_deixis_infos if args['evaluate_discourse_deixis'] else reader.doc_coref_infos,
+			metric,
+			beta=1,
+			only_split_antecedent=args['only_split_antecedent'])
+
+		calculated_metrics[f"Precision({name})"] = precision
+		calculated_metrics[f"Recall({name})"] = recall
+		calculated_metrics[f"F1({name})"] = f1
+
+		if name in ["muc", "bcub", "ceafe"]:
+			conll += f1
+			conll_subparts_num += 1
+
+	if conll_subparts_num == 3:
+		conll = (conll / 3)
+		calculated_metrics["conll"] = conll
+
+	return calculated_metrics
diff --git a/Benchmarking_SloBENCH/eval_coref149/requirements.txt b/Benchmarking_SloBENCH/eval_coref149/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+scipy>=0.17.0
+pytest
+udapi>=0.3.0
diff --git a/Benchmarking_SloBENCH/eval_coref149/sample_ground_truth.zip b/Benchmarking_SloBENCH/eval_coref149/sample_ground_truth.zip
diff --git a/Benchmarking_SloBENCH/eval_coref149/sample_submission.zip b/Benchmarking_SloBENCH/eval_coref149/sample_submission.zip
diff --git a/Benchmarking_SloBENCH/eval_coref149/scorer/__init__.py b/Benchmarking_SloBENCH/eval_coref149/scorer/__init__.py
diff --git a/Benchmarking_SloBENCH/eval_coref149/scorer/base/__init__.py b/Benchmarking_SloBENCH/eval_coref149/scorer/base/__init__.py
diff --git a/Benchmarking_SloBENCH/eval_coref149/scorer/base/mention.py b/Benchmarking_SloBENCH/eval_coref149/scorer/base/mention.py
@@ -0,0 +1,184 @@
+class Mention:
+    def __init__(self, matching="exact"):
+        # here we only include the properties might be used outside the mention class,
+        # and assign a default value to make sure no error even if fuction not used by
+        # specific format
+        self._words = []  # store all word indies
+        self._wordsset = set()
+        self._minset = set()
+        self._is_referring = True  # for non-referring
+        self._is_split_antecedent = False  # for split-antecedent
+        self._split_antecedent_sets = set()  # for split-antecedent
+        self._is_zero = False
+        # in case of the "head" matching, the two mentions are considered to be the same
+        # only if their spans as well as their min sets are the same
+        if matching == "head":
+            self._eq_match = self._super_exact_match
+            self._hash_match = self._super_exact_match_hash
+        # for the remaining matching types, it is sufficient for the spans to be tha same
+        else:
+            self._eq_match = self._exact_match
+            self._hash_match = self._exact_match_hash
+
+    ############## Properties ###############
+
+    @property
+    def words(self):
+        return self._words
+
+    @property
+    def start(self):
+        return self._words[0]
+
+    @property
+    def end(self):
+        return self._words[-1]
+
+    @property
+    def is_zero(self):
+        return self._is_zero
+
+    @property
+    def is_referring(self):
+        return self._is_referring
+
+    @property
+    def is_split_antecedent(self):
+        return self._is_split_antecedent
+
+    @property
+    def split_antecedent_sets(self):
+        return self._split_antecedent_sets
+
+    ############## Operators ###############
+
+    def __getitem__(self, i):
+        return self._words[i]
+
+    def __len__(self):
+        return len(self._words)
+
+    def __eq__(self, other):
+        return self._eq_match(other)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __lt__(self, other):
+        if isinstance(other, self.__class__):
+            if self._words[0] == other._words[0]:
+                if self._words[-1] == other._words[-1]:
+                    return len(self._words) < len(other._words)
+                else:
+                    return self._words[-1] < other._words[-1]
+            else:
+                return self._words[0] < other._words[0]
+        return NotImplemented
+
+    def __le__(self, other):
+        return self.__lt__(other) or self.__eq__(other)
+
+    def __hash__(self):
+        if self.is_split_antecedent:
+            return hash(frozenset(self.split_antecedent_sets))
+        return self._hash_match()
+
+    def __str__(self):
+        if self.is_split_antecedent:
+            return "({:s})".format(",".join([str(cl[0]) for cl in self.split_antecedent_sets]))
+        return "({:s})".format(
+            ",".join([str(w) + "*" if self._minset and w in self._minset else str(w) for w in self._words]))
+
+    def __repr__(self):
+        return str(self)
+
+    def intersection(self, other):
+        if isinstance(other, self.__class__):
+            if self._words[0] > other._words[-1] or \
+                other._words[0] > self._words[-1]:
+                return []
+            return self._wordsset.intersection(other._wordsset)
+        return NotImplemented
+
+    ############## Matching types #################
+
+    # both mention span and its min set must be matched exactly
+    def _super_exact_match(self, other):
+        if not isinstance(other, self.__class__):
+            return NotImplemented
+
+        # for split-antecedent we check all the members are the same
+        if self.is_split_antecedent or other.is_split_antecedent:
+            return self.split_antecedent_sets == other.split_antecedent_sets
+
+        # check if the mention spans are the same
+        # TODO rewrite using _wordsset
+        if len(self._words) != len(other._words):
+            return False
+        words_zip = zip(self._words, other._words)
+        if not all(self_w == other_w for self_w, other_w in words_zip):
+            return False
+
+        # check if the min spans / heads are the same
+        return self._minset == other._minset
+
+    # mention span must be matched
+    def _exact_match(self, other):
+        if isinstance(other, self.__class__):
+            # for split-antecedent we check all the members are the same
+            if self.is_split_antecedent or other.is_split_antecedent:
+                return self.split_antecedent_sets == other.split_antecedent_sets
+            else:
+                if len(self._words) != len(other._words):
+                    return False
+                words_zip = zip(self._words, other._words)
+                return all(self_w == other_w for self_w, other_w in words_zip)
+
+    def match_score(self, other, matching):
+        if not isinstance(other, self.__class__):
+            return NotImplemented
+        if matching == "zero-dependent":
+            return self.zero_dependent_match_score(other)
+        if matching == "partial-craft":
+            return self.craft_partial_match_score(other)
+        if matching == "partial-corefud":
+            return self.corefud_partial_match_score(other)
+        if matching == "head":
+            return self.head_match_score(other)
+        # exact match
+        if self.__eq__(other):
+            return 1.0
+        return 0.0
+
+    # Default (with MIN tag) similar to the CorefUD that allow the response to be part of the key, in the
+    #             sametime the response must include all the words in MIN(head), if the above condition is
+    #             satisfied then a non-zero similarity score based on the proportion of the common words
+    #             (num_of_common_words/total_words_in_key) will be returned otherwise 0 will be returned.
+    # self = key mention, other = sys mention
+    def corefud_partial_match_score(self, other):
+        if self._minset and self._minset.issubset(other._wordsset) and other._wordsset.issubset(
+            self._wordsset):
+            return len(self._wordsset & other._wordsset) * 1.0 / len(self._wordsset)
+        return 0.0
+
+    # CRAFT (with craft tag) same as the CRAFT 2019 CR task that use the first key span as the MIN and any
+    #             response that overlapping with the MIN (start>=MIN[0] and end <=MIN[1]) will receive a
+    #             non-zero similarity score otherwise a zero will be returned.
+    # self = key mention, other = sys mention
+    def craft_partial_match_score(self, other):
+        # only support UA format yet
+        return NotImplemented
+
+    # self = key mention, other = sys mention
+    def head_match_score(self, other):
+        # only support CorefUD format yet
+        return NotImplemented
+
+    def zero_dependent_match_score(self, other):
+        return NotImplemented
+
+    def _exact_match_hash(self):
+        return hash(frozenset(self._words))
+
+    def _super_exact_match_hash(self):
+        return hash((frozenset(self._words), frozenset(self._minset)))