Skip to content

Commit

Permalink
Add the ability to score the tokenizer training set as a test set - mostly useful to confirm whether it is learning the given dataset or not
Browse files (browse the repository at this point in the history)
  • Loading branch information
AngledLuffa committed Nov 25, 2024
1 parent 97cc056 commit a877a3c
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
2 changes: 2 additions & 0 deletions stanza/utils/training/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class Mode(Enum):
TRAIN = 1
SCORE_DEV = 2
SCORE_TEST = 3
SCORE_TRAIN = 4

class ArgumentParserWithExtraHelp(argparse.ArgumentParser):
def __init__(self, sub_argparse, *args, **kwargs):
Expand Down Expand Up @@ -56,6 +57,7 @@ def build_argparse(sub_argparse=None):
parser.add_argument('--train', dest='mode', default=Mode.TRAIN, action='store_const', const=Mode.TRAIN, help='Run in train mode')
parser.add_argument('--score_dev', dest='mode', action='store_const', const=Mode.SCORE_DEV, help='Score the dev set')
parser.add_argument('--score_test', dest='mode', action='store_const', const=Mode.SCORE_TEST, help='Score the test set')
parser.add_argument('--score_train', dest='mode', action='store_const', const=Mode.SCORE_TRAIN, help='Score the train set as a test set. Currently only implemented for some models')

# These arguments need to be here so we can identify if the model already exists in the user-specified home
# TODO: when all of the model scripts handle their own names, can eliminate this argument
Expand Down
15 changes: 15 additions & 0 deletions stanza/utils/training/run_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,15 @@ def run_treebank(mode, paths, treebank, short_name,
if short_language == "zh" or short_language.startswith("zh-"):
extra_args = ["--skip_newline"] + extra_args

train_gold = f"{tokenize_dir}/{short_name}.train.gold.conllu"
dev_gold = f"{tokenize_dir}/{short_name}.dev.gold.conllu"
test_gold = f"{tokenize_dir}/{short_name}.test.gold.conllu"

train_mwt = f"{tokenize_dir}/{short_name}-ud-train-mwt.json"
dev_mwt = f"{tokenize_dir}/{short_name}-ud-dev-mwt.json"
test_mwt = f"{tokenize_dir}/{short_name}-ud-test-mwt.json"

train_pred = temp_output_file if temp_output_file else f"{tokenize_dir}/{short_name}.train.pred.conllu"
dev_pred = temp_output_file if temp_output_file else f"{tokenize_dir}/{short_name}.dev.pred.conllu"
test_pred = temp_output_file if temp_output_file else f"{tokenize_dir}/{short_name}.test.pred.conllu"

Expand Down Expand Up @@ -102,6 +105,18 @@ def run_treebank(mode, paths, treebank, short_name,
results = common.run_eval_script_tokens(test_gold, test_pred)
logger.info("Finished running test set on\n{}\n{}".format(treebank, results))

if mode == Mode.SCORE_TRAIN:
test_args = ["--mode", "predict", test_type, train_file, "--lang", short_language,
"--conll_file", train_pred, "--shorthand", short_name, "--mwt_json_file", train_mwt]
test_args = test_args + extra_args
logger.info("Running test step with args: {}".format(test_args))
tokenizer.main(test_args)

results = common.run_eval_script_tokens(train_gold, train_pred)
logger.info("Finished running train set as a test on\n{}\n{}".format(treebank, results))



def main():
    """Entry point: run the shared training driver for the tokenizer.

    Builds the tokenizer's own argument parser and passes it, together
    with ``run_treebank`` and the "tokenize" / "tokenizer" labels, to
    ``common.main``.  What ``common.main`` does with those labels is not
    visible from this block -- see ``stanza/utils/training/common.py``.
    """
    tokenizer_parser = tokenizer.build_argparse()
    common.main(run_treebank, "tokenize", "tokenizer", sub_argparse=tokenizer_parser)

Expand Down

0 comments on commit a877a3c

Please sign in to comment.