Skip to content

Commit

Permalink
Upload archive of deliverables
Browse files Browse the repository at this point in the history
  • Loading branch information
matejklemen committed Nov 17, 2024
1 parent a037113 commit 2e2f947
Show file tree
Hide file tree
Showing 76 changed files with 6,708 additions and 0 deletions.
12 changes: 12 additions & 0 deletions Benchmarking_SloBENCH/eval_coref149/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Evaluation image for the eval_coref149 task.
# All COPY sources are relative to the build context, so the image has to be
# built with a context that contains both evaluation_scripts/ and run.py.
FROM python:3.8.11-bullseye

# Scorer package and the glue scripts that call into it.
COPY evaluation_scripts/eval_coref149/scorer ./scorer
COPY evaluation_scripts/eval_coref149/ua-scorer.py .
COPY evaluation_scripts/eval_coref149/evaluate_corefud.py .
COPY evaluation_scripts/eval_coref149/evaluate.py .
# Install the Python dependencies; --no-cache-dir keeps pip's download cache
# out of the image layer.
COPY evaluation_scripts/eval_coref149/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Entry script, taken from the root of the build context.
COPY run.py .

# Arguments given to `docker run` after the image name are appended to this command.
ENTRYPOINT ["python3", "run.py"]
22 changes: 22 additions & 0 deletions Benchmarking_SloBENCH/eval_coref149/LICENSE_SCORER
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
The MIT License (MIT)

Copyright 2018 Nafise Sadat Moosavi (ns.moosavi at gmail dot com)
Copyright 2021 Juntao Yu (juntao.cn at gmail dot com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
18 changes: 18 additions & 0 deletions Benchmarking_SloBENCH/eval_coref149/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# eval_coref149 slobench evaluation script

All commands should be run from the root directory of the repository.

## Build docker image (from the root directory of this repo):

```
docker buildx build --platform linux/amd64 -t eval:eval_coref149 -f evaluation_scripts/eval_coref149/Dockerfile .
```

## Run mock evaluation (from the root directory of this repo)

```
docker run -it --name eval-container_coref149 --rm \
-v $PWD/evaluation_scripts/eval_coref149/sample_ground_truth.zip:/ground_truth.zip \
-v $PWD/evaluation_scripts/eval_coref149/sample_submission.zip:/submission.zip \
eval:eval_coref149 ground_truth.zip submission.zip
```
13 changes: 13 additions & 0 deletions Benchmarking_SloBENCH/eval_coref149/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import os

from evaluate_corefud import call_scorer


def evaluate(data_ground_truth_path, data_submission_path):
    """Score a submission against the ground truth and return the metrics.

    The function is glue code around the bundled scorer: it resolves the two
    expected file names inside the given directories and forwards them to
    ``call_scorer``.

    Args:
        data_ground_truth_path: Directory (relative to the current working
            directory) that contains ``coref149.conllu``.
        data_submission_path: Directory (relative to the current working
            directory) that contains ``submission.conllu``.

    Returns:
        Whatever ``call_scorer`` returns for the two files.

    Raises:
        Exception: If anything fails while computing the metrics. The
            original error is attached as ``__cause__`` so that its traceback
            is not lost.
    """
    try:
        # The script is just glue code around ufal/corefud-scorer
        metrics = call_scorer(os.path.join(".", data_ground_truth_path, "coref149.conllu"),
                              os.path.join(".", data_submission_path, "submission.conllu"))
        return metrics
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback.
        raise Exception(f'Exception in metric calculation: {e}') from e
53 changes: 53 additions & 0 deletions Benchmarking_SloBENCH/eval_coref149/evaluate_corefud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import importlib

from scorer.corefud.reader import CorefUDReader
from scorer.eval import evaluator
uascorer = importlib.import_module("ua-scorer")


def call_scorer(ref_file, pred_file):
    """Run the bundled coreference scorer on a reference/prediction file pair.

    A fixed scorer configuration is built, handed to ``ua-scorer``'s
    ``process_arguments``, and then used to construct a ``CorefUDReader``
    that loads both files. Every entry of ``args["metrics"]`` is evaluated
    with ``evaluator.evaluate_documents``.

    Args:
        ref_file: Path of the reference (key) file.
        pred_file: Path of the predicted (system) file.

    Returns:
        dict with the keys ``Precision(<name>)``, ``Recall(<name>)`` and
        ``F1(<name>)`` for every evaluated metric, plus ``conll`` (the mean
        F1 of muc, bcub and ceafe) when all three of them were evaluated.
    """
    args = {
        "key_file": ref_file,
        "sys_file": pred_file,
        "metrics": ['muc', 'bcub', 'ceafe', 'ceafm', 'blanc', 'lea', 'mor'],
        "keep_singletons": False,
        "match": "head",
        "zero_match_method": "dependent",
        "format": "corefud",
        "keep_split_antecedents": False,
        "keep_zeros": True,
        "evaluate_discourse_deixis": False,
        "only_split_antecedent": False,
        "allow_boundary_crossing": False,
        "np_only": False,
        "remove_nested_mentions": False,
        "shared_task": None
    }
    # NOTE(review): the loop below unpacks (name, metric) pairs, so this call
    # presumably rewrites args["metrics"] in place -- confirm against ua-scorer.
    uascorer.process_arguments(args)

    reader = CorefUDReader(**args)
    reader.get_coref_infos(args["key_file"], args["sys_file"])

    conll_parts = ("muc", "bcub", "ceafe")
    f1_sum = 0
    conll_parts_seen = 0
    results = {}

    for metric_name, metric_fn in args["metrics"]:
        if args['evaluate_discourse_deixis']:
            doc_infos = reader.doc_discourse_deixis_infos
        else:
            doc_infos = reader.doc_coref_infos

        recall, precision, f1 = evaluator.evaluate_documents(
            doc_infos,
            metric_fn,
            beta=1,
            only_split_antecedent=args['only_split_antecedent'])

        results.update({
            f"Precision({metric_name})": precision,
            f"Recall({metric_name})": recall,
            f"F1({metric_name})": f1,
        })

        if metric_name in conll_parts:
            f1_sum += f1
            conll_parts_seen += 1

    # The CoNLL score is only defined when all three component metrics ran.
    if conll_parts_seen == 3:
        results["conll"] = f1_sum / 3

    return results
4 changes: 4 additions & 0 deletions Benchmarking_SloBENCH/eval_coref149/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
numpy
scipy>=0.17.0
pytest
udapi>=0.3.0
Binary file not shown.
Binary file not shown.
Empty file.
Empty file.
184 changes: 184 additions & 0 deletions Benchmarking_SloBENCH/eval_coref149/scorer/base/mention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
class Mention:
    """A mention described by the indices of the words it covers.

    The class only holds the properties that may be used outside of it and
    gives each a default value, so that code which does not use a particular
    feature still works. The underscore-prefixed attributes are expected to
    be filled in by whoever builds the mention.

    Equality and hashing depend on the ``matching`` mode passed to the
    constructor:

    * ``"head"``: two mentions are equal only if both their word spans and
      their min sets are the same;
    * anything else: it is sufficient for the word spans to be the same.

    Split-antecedent mentions are compared and hashed by their
    ``split_antecedent_sets`` instead.
    """

    def __init__(self, matching="exact"):
        # here we only include the properties that might be used outside the
        # mention class, and assign a default value to make sure there is no
        # error even if a function is not used by a specific format
        self._words = []  # store all word indices
        self._wordsset = set()
        self._minset = set()
        self._is_referring = True  # for non-referring
        self._is_split_antecedent = False  # for split-antecedent
        self._split_antecedent_sets = set()  # for split-antecedent
        self._is_zero = False
        # in case of the "head" matching, the two mentions are considered to be the same
        # only if their spans as well as their min sets are the same
        if matching == "head":
            self._eq_match = self._super_exact_match
            self._hash_match = self._super_exact_match_hash
        # for the remaining matching types, it is sufficient for the spans to be the same
        else:
            self._eq_match = self._exact_match
            self._hash_match = self._exact_match_hash

    ############## Properties ###############

    @property
    def words(self):
        return self._words

    @property
    def start(self):
        return self._words[0]

    @property
    def end(self):
        return self._words[-1]

    @property
    def is_zero(self):
        return self._is_zero

    @property
    def is_referring(self):
        return self._is_referring

    @property
    def is_split_antecedent(self):
        return self._is_split_antecedent

    @property
    def split_antecedent_sets(self):
        return self._split_antecedent_sets

    ############## Operators ###############

    def __getitem__(self, i):
        return self._words[i]

    def __len__(self):
        return len(self._words)

    def __eq__(self, other):
        return self._eq_match(other)

    def __ne__(self, other):
        equal = self.__eq__(other)
        # NotImplemented is truthy, so negating it would wrongly report
        # "not different" for foreign types; hand the decision back to Python.
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __lt__(self, other):
        """Order by first word, then by last word, then by number of words."""
        if isinstance(other, self.__class__):
            if self._words[0] == other._words[0]:
                if self._words[-1] == other._words[-1]:
                    return len(self._words) < len(other._words)
                else:
                    return self._words[-1] < other._words[-1]
            else:
                return self._words[0] < other._words[0]
        return NotImplemented

    def __le__(self, other):
        less = self.__lt__(other)
        # Do not evaluate NotImplemented in a boolean context.
        if less is NotImplemented:
            return NotImplemented
        if less:
            return True
        return self.__eq__(other)

    def __hash__(self):
        # Must stay consistent with the equality used by __eq__.
        if self.is_split_antecedent:
            return hash(frozenset(self.split_antecedent_sets))
        return self._hash_match()

    def __str__(self):
        if self.is_split_antecedent:
            return "({:s})".format(",".join([str(cl[0]) for cl in self.split_antecedent_sets]))
        # words that belong to the min set are marked with a trailing "*"
        return "({:s})".format(
            ",".join([str(w) + "*" if self._minset and w in self._minset else str(w) for w in self._words]))

    def __repr__(self):
        return str(self)

    def intersection(self, other):
        """Return the words shared with ``other``.

        Returns an empty list when the two spans cannot overlap at all,
        otherwise the set intersection of both word sets, and
        ``NotImplemented`` when ``other`` is not a mention of this class.
        """
        if isinstance(other, self.__class__):
            # cheap rejection: one span ends before the other one starts
            if self._words[0] > other._words[-1] or \
                    other._words[0] > self._words[-1]:
                return []
            return self._wordsset.intersection(other._wordsset)
        return NotImplemented

    ############## Matching types #################

    def _same_words(self, other):
        """Return True when both mentions list the same words in the same order."""
        # TODO rewrite using _wordsset
        if len(self._words) != len(other._words):
            return False
        return all(self_w == other_w for self_w, other_w in zip(self._words, other._words))

    # both mention span and its min set must be matched exactly
    def _super_exact_match(self, other):
        if not isinstance(other, self.__class__):
            return NotImplemented

        # for split-antecedent we check all the members are the same
        if self.is_split_antecedent or other.is_split_antecedent:
            return self.split_antecedent_sets == other.split_antecedent_sets

        # check if the mention spans are the same
        if not self._same_words(other):
            return False

        # check if the min spans / heads are the same
        return self._minset == other._minset

    # mention span must be matched
    def _exact_match(self, other):
        # Signal "cannot compare" the same way _super_exact_match does,
        # instead of implicitly returning None.
        if not isinstance(other, self.__class__):
            return NotImplemented

        # for split-antecedent we check all the members are the same
        if self.is_split_antecedent or other.is_split_antecedent:
            return self.split_antecedent_sets == other.split_antecedent_sets

        return self._same_words(other)

    def match_score(self, other, matching):
        """Return the similarity of ``other`` to this mention for ``matching``.

        Dispatches to the scoring method that belongs to the given matching
        name; any unrecognised name falls back to exact matching, which
        yields 1.0 for equal mentions and 0.0 otherwise. Returns
        ``NotImplemented`` when ``other`` is not a mention of this class.
        """
        if not isinstance(other, self.__class__):
            return NotImplemented
        if matching == "zero-dependent":
            return self.zero_dependent_match_score(other)
        if matching == "partial-craft":
            return self.craft_partial_match_score(other)
        if matching == "partial-corefud":
            return self.corefud_partial_match_score(other)
        if matching == "head":
            return self.head_match_score(other)
        # exact match
        if self.__eq__(other):
            return 1.0
        return 0.0

    # Default (with MIN tag) similar to the CorefUD that allows the response to be part of the key;
    # at the same time the response must include all the words in MIN (head). If the above condition
    # is satisfied then a non-zero similarity score based on the proportion of the common words
    # (num_of_common_words/total_words_in_key) will be returned, otherwise 0 will be returned.
    # self = key mention, other = sys mention
    def corefud_partial_match_score(self, other):
        if self._minset and self._minset.issubset(other._wordsset) and other._wordsset.issubset(
                self._wordsset):
            return len(self._wordsset & other._wordsset) * 1.0 / len(self._wordsset)
        return 0.0

    # CRAFT (with craft tag) same as the CRAFT 2019 CR task that uses the first key span as the MIN;
    # any response overlapping with the MIN (start>=MIN[0] and end<=MIN[1]) will receive a
    # non-zero similarity score, otherwise a zero will be returned.
    # self = key mention, other = sys mention
    def craft_partial_match_score(self, other):
        # only support UA format yet
        return NotImplemented

    # self = key mention, other = sys mention
    def head_match_score(self, other):
        # only support CorefUD format yet
        return NotImplemented

    def zero_dependent_match_score(self, other):
        return NotImplemented

    def _exact_match_hash(self):
        return hash(frozenset(self._words))

    def _super_exact_match_hash(self):
        return hash((frozenset(self._words), frozenset(self._minset)))
Loading

0 comments on commit 2e2f947

Please sign in to comment.