diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a00955c..274f6c6e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,3 +50,7 @@ jobs: shell: bash run: | python -m poetry install + + - name: Test with pytest + run: | + poetry run pytest -m "not deezy" \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index c4a08fc6..2273ff4f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -49,7 +49,7 @@ jobs: - name: Install dependencies shell: bash run: | - python -m poetry install --extras docs + python -m poetry install --with docs - name: Build documentation run: | diff --git a/.gitignore b/.gitignore index 0eac6891..54182d4c 100644 --- a/.gitignore +++ b/.gitignore @@ -129,18 +129,18 @@ dmypy.json .pyre/ -outputs/ -resources/ +/experiments/outputs/ +/resources/ poetry.lock .vscode/* -evaluation/results/* -evaluation/CLEF-HIPE-2020-scorer/ -experiments/tmp_* +/evaluation/results/* +/evaluation/HIPE-scorer/ +/experiments/tmp_* preprocessing/toponymmatching/experiments/ -experiments/REL/ -evaluation/results_table.pkl -experiments/explore_data.ipynb -experiments/examine_res.py +/experiments/REL/ +/evaluation/results_table.pkl +/experiments/explore_data.ipynb +/experiments/examine_res.py # Docs _build diff --git a/README.md b/README.md index 120cda15..802a575c 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,9 @@ T-Res relies on several resources in the following directory structure: ``` T-Res/ +├── t_res/ +│ ├── geoparser/ +│ └── utils/ ├── app/ ├── evaluation/ ├── examples/ @@ -38,11 +41,10 @@ T-Res/ │ ├── linking_df_split.tsv [*?] │ ├── ner_fine_dev.json [*+?] │ └── ner_fine_train.json [*+?] -├── geoparser/ ├── resources/ │ ├── deezymatch/ │ │ └── data/ -│ │ └── w2v_ocr_pairs.txt [*+?] +│ │ └── w2v_ocr_pairs.txt [?] 
│ ├── models/ │ ├── news_datasets/ │ ├── rel_db/ @@ -53,8 +55,7 @@ T-Res/ │ ├── mentions_to_wikidata.json [*] │ ├── wikidta_gazetteer.csv [*] │ └── wikidata_to_mentions_normalized.json [*] -├── tests/ -└── utils/ +└── tests/ ``` These resources are described in detail in the documentation. A question mark (`?`) is used to indicate resources which are only required for some approaches (for example, the `rel_db/embeddings_database.db` file is only required by the REL-based disambiguation approaches). Note that an asterisk (`*`) next to the resource means that the path can be changed when instantiating the T-Res objects, and a plus sign (`+`) if the name of the file can be changed in the instantiation. @@ -68,7 +69,7 @@ This is an example on how to use the default T-Res pipeline: ```python -from geoparser import pipeline +from t_res.geoparser import pipeline -geoparser = pipeline.Pipeline() +geoparser = pipeline.Pipeline(resources_path="./resources") output = geoparser.run_text("She was on a visit at Chippenham.") ``` diff --git a/app/app_template.py b/app/app_template.py index a034b275..92e435ae 100644 --- a/app/app_template.py +++ b/app/app_template.py @@ -8,18 +8,9 @@ from fastapi import FastAPI, Request from pydantic import BaseModel -if "toponym-resolution" in __file__: - root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -else: - root_path = os.path.dirname(os.path.abspath(__file__)) -experiments_path = Path(root_path, "experiments") -sys.path.insert(0, str(root_path)) -sys.path.insert(0, str(experiments_path)) -os.chdir(experiments_path) - from config import CONFIG as pipeline_config -from geoparser import pipeline +from t_res.geoparser import pipeline geoparser = pipeline.Pipeline(**pipeline_config) diff --git a/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py b/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py index 574fc777..e4e6468f 100644 --- a/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py +++ b/app/configs/t-res_deezy_reldisamb-wpubl-wmtops.py @@ -1,29 +1,26 @@ 
-import os -import sys import sqlite3 from pathlib import Path -# sys.path.insert(0, os.path.abspath(os.path.pardir)) -from geoparser import pipeline, ranking, linking +from t_res.geoparser import linking, pipeline, ranking # -------------------------------------- # Instantiate the ranker: myranker = ranking.Ranker( method="deezymatch", - resources_path="../resources/wikidata/", + resources_path="./resources/", strvar_parameters={ # Parameters to create the string pair dataset: "ocr_threshold": 60, "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("../resources/models/w2v/").resolve()), + "w2v_ocr_path": str(Path("./resources/models/w2v/").resolve()), "w2v_ocr_model": "w2v_*_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("../resources/deezymatch/").resolve()), + "dm_path": str(Path("./resources/deezymatch/").resolve()), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -38,15 +35,16 @@ }, ) -with sqlite3.connect("../resources/rel_db/embeddings_database.db") as conn: +with sqlite3.connect("./resources/rel_db/embeddings_database.db") as conn: cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="../resources/", + resources_path="./resources/", + experiments_path="./experiments/", linking_resources=dict(), rel_params={ - "model_path": "../resources/models/disambiguation/", - "data_path": "outputs/data/lwm/", + "model_path": "./resources/models/disambiguation/", + "data_path": "./experiments/outputs/data/lwm/", "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": True, diff --git a/app/run_local_app.py b/app/run_local_app.py index b5ebeb33..9afd86b0 100755 --- a/app/run_local_app.py +++ b/app/run_local_app.py @@ -1,33 +1,23 @@ +import importlib import os import sys import time from pathlib import Path -from typing import Union, Optional, List +from typing import 
List, Optional, Union import uvicorn from fastapi import FastAPI, Request from pydantic import BaseModel -if "toponym-resolution" in __file__: - root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -else: - root_path = os.path.dirname(os.path.abspath(__file__)) -experiments_path = Path(root_path, "experiments") -sys.path.insert(0, str(root_path)) -sys.path.insert(0, str(experiments_path)) -os.chdir(experiments_path) +from t_res.geoparser import pipeline os.environ["APP_CONFIG_NAME"] = "t-res_deezy_reldisamb-wpubl-wmtops" -import importlib config_mod = importlib.import_module( ".t-res_deezy_reldisamb-wpubl-wmtops", "app.configs" ) pipeline_config = config_mod.CONFIG - -from geoparser import pipeline - geoparser = pipeline.Pipeline(**pipeline_config) diff --git a/app/template.Dockerfile b/app/template.Dockerfile index 9bd4fc25..b1484973 100644 --- a/app/template.Dockerfile +++ b/app/template.Dockerfile @@ -4,10 +4,11 @@ ARG APP_NAME WORKDIR /app COPY pyproject.toml /app/pyproject.toml +COPY t_res /app/t_res RUN pip3 install poetry RUN poetry config virtualenvs.create false -RUN poetry install --no-dev +RUN poetry install ENV APP_CONFIG_NAME=${APP_NAME} COPY app/app_template.py /app/app.py diff --git a/docs/source/experiments/index.rst b/docs/source/experiments/index.rst index b08d24f4..a1deead1 100644 --- a/docs/source/experiments/index.rst +++ b/docs/source/experiments/index.rst @@ -6,7 +6,7 @@ Follow these steps to reproduce the experiments in our paper. 1. Obtain the external resources -------------------------------- -Follow the instructions in the ":doc:`resources`" page in the documentation +Follow the instructions in the ":doc:`/getting-started/resources`" page in the documentation to obtain the resources required for running the experiments. 2. Preparing the data @@ -17,7 +17,7 @@ run the following command from the ``./experiments/`` folder: .. 
code-block:: bash - $ python ./prepare_data.py + $ python ./prepare_data.py -p ../resources This script takes care of downloading the LwM and HIPE datasets and format them as needed in the experiments. @@ -30,7 +30,7 @@ folder: .. code-block:: bash - $ python ./toponym_resolution.py + $ python ./toponym_resolution.py -p ../resources This script does runs for all different scenarios reported in the experiments in the paper. diff --git a/docs/source/getting-started/complete-tour.rst b/docs/source/getting-started/complete-tour.rst index bee8401a..bf48f8df 100644 --- a/docs/source/getting-started/complete-tour.rst +++ b/docs/source/getting-started/complete-tour.rst @@ -47,7 +47,9 @@ To instantiate the default T-Res pipeline, do: from geoparser import pipeline - geoparser = pipeline.Pipeline() + geoparser = pipeline.Pipeline(resources_path="../resources/") + +.. note:: You should update the resources path argument to reflect your set up. You can also instantiate a pipeline using a customised Recogniser, Ranker and Linker. 
To see the different options, refer to the sections on instantiating @@ -603,7 +605,7 @@ and ``levenshtein`` respectively), instantiate it as follows, changing the myranker = ranking.Ranker( method="perfectmatch", # or "partialmatch" or "levenshtein" - resources_path="resources/wikidata/", + resources_path="resources/", ) Note that ``resources_path`` should contain the path to the directory @@ -668,7 +670,7 @@ The Ranker can then be instantiated as follows: myranker = ranking.Ranker( # Generic Ranker parameters: method="deezymatch", - resources_path="resources/wikidata/", + resources_path="resources/", # Parameters to create the string pair dataset: strvar_parameters=dict(), # Parameters to train, load and use a DeezyMatch model: @@ -757,7 +759,7 @@ The Ranker can then be instantiated as follows: myranker = ranking.Ranker( # Generic Ranker parameters: method="deezymatch", - resources_path="resources/wikidata/", + resources_path="resources/", # Parameters to create the string pair dataset: strvar_parameters={ "ocr_threshold": 60, diff --git a/docs/source/getting-started/resources.rst b/docs/source/getting-started/resources.rst index 6b72addc..8745d4c8 100644 --- a/docs/source/getting-started/resources.rst +++ b/docs/source/getting-started/resources.rst @@ -561,6 +561,9 @@ for the mentioned resources that are required in order to run the pipeline. :: T-Res/ + ├── t_res/ + │ ├── geoparser/ + │ └── utils/ ├── app/ ├── evaluation/ ├── examples/ @@ -571,7 +574,6 @@ for the mentioned resources that are required in order to run the pipeline. │ ├── linking_df_split.tsv [*?] │ ├── ner_fine_dev.json [*+?] │ └── ner_fine_train.json [*+?] - ├── geoparser/ ├── resources/ │ ├── deezymatch/ │ │ └── data/ @@ -586,8 +588,7 @@ for the mentioned resources that are required in order to run the pipeline. 
│ ├── mentions_to_wikidata.json [*] │ ├── wikidta_gazetteer.csv [*] │ └── wikidata_to_mentions_normalized.json [*] - ├── tests/ - └── utils/ + └── tests/ A question mark (``?``) is used to indicate resources which are only required for some approaches (for example, the ``rel_db/embeddings_database.db`` file diff --git a/docs/source/reference/geoparser/linker.rst b/docs/source/reference/geoparser/linker.rst index 26ee9990..e6bb8091 100644 --- a/docs/source/reference/geoparser/linker.rst +++ b/docs/source/reference/geoparser/linker.rst @@ -1,8 +1,8 @@ -``geoparser.linking.Linker`` +``t_res.geoparser.linking.Linker`` ============================ -.. autoclass:: geoparser.linking.Linker +.. autoclass:: t_res.geoparser.linking.Linker :members: :undoc-members: -.. autoattribute:: geoparser.linking.RANDOM_SEED \ No newline at end of file +.. autoattribute:: t_res.geoparser.linking.RANDOM_SEED \ No newline at end of file diff --git a/docs/source/reference/geoparser/pipeline.rst b/docs/source/reference/geoparser/pipeline.rst index 392610ae..95e68b45 100644 --- a/docs/source/reference/geoparser/pipeline.rst +++ b/docs/source/reference/geoparser/pipeline.rst @@ -1,6 +1,6 @@ -``geoparser.pipeline.Pipeline`` +``t_res.geoparser.pipeline.Pipeline`` =============================== -.. autoclass:: geoparser.pipeline.Pipeline +.. autoclass:: t_res.geoparser.pipeline.Pipeline :members: :undoc-members: diff --git a/docs/source/reference/geoparser/ranker.rst b/docs/source/reference/geoparser/ranker.rst index 659f928a..c31dd884 100644 --- a/docs/source/reference/geoparser/ranker.rst +++ b/docs/source/reference/geoparser/ranker.rst @@ -1,6 +1,6 @@ -``geoparser.ranking. Ranker`` +``t_res.geoparser.ranking. Ranker`` ============================= -.. autoclass:: geoparser.ranking.Ranker +.. 
autoclass:: t_res.geoparser.ranking.Ranker :members: :undoc-members: diff --git a/docs/source/reference/geoparser/recogniser.rst b/docs/source/reference/geoparser/recogniser.rst index 5b4543ca..d437b140 100644 --- a/docs/source/reference/geoparser/recogniser.rst +++ b/docs/source/reference/geoparser/recogniser.rst @@ -1,6 +1,6 @@ -``geoparser.recogniser.Recogniser`` +``t_res.geoparser.recogniser.Recogniser`` =================================== -.. autoclass:: geoparser.recogniser.Recogniser +.. autoclass:: t_res.geoparser.recogniser.Recogniser :members: :undoc-members: diff --git a/docs/source/reference/utils/deezy_processing.rst b/docs/source/reference/utils/deezy_processing.rst index 6f9ae76f..aa80e247 100644 --- a/docs/source/reference/utils/deezy_processing.rst +++ b/docs/source/reference/utils/deezy_processing.rst @@ -1,10 +1,10 @@ -``utils.deezy_processing`` module +``t_res.utils.deezy_processing`` module ================================= -.. autofunction:: utils.deezy_processing.obtain_matches +.. autofunction:: t_res.utils.deezy_processing.obtain_matches -.. autofunction:: utils.deezy_processing.create_training_set +.. autofunction:: t_res.utils.deezy_processing.create_training_set -.. autofunction:: utils.deezy_processing.train_deezy_model +.. autofunction:: t_res.utils.deezy_processing.train_deezy_model -.. autofunction:: utils.deezy_processing.generate_candidates \ No newline at end of file +.. autofunction:: t_res.utils.deezy_processing.generate_candidates \ No newline at end of file diff --git a/docs/source/reference/utils/get_data.rst b/docs/source/reference/utils/get_data.rst index f3edecb1..c3016cf1 100644 --- a/docs/source/reference/utils/get_data.rst +++ b/docs/source/reference/utils/get_data.rst @@ -1,6 +1,6 @@ -``utils.get_data`` module +``t_res.utils.get_data`` module ========================= -.. autofunction:: utils.get_data.download_lwm_data +.. autofunction:: t_res.utils.get_data.download_lwm_data -.. 
autofunction:: utils.get_data.download_hipe_data \ No newline at end of file +.. autofunction:: t_res.utils.get_data.download_hipe_data \ No newline at end of file diff --git a/docs/source/reference/utils/ner.rst b/docs/source/reference/utils/ner.rst index 363f5484..d8d3dc0b 100644 --- a/docs/source/reference/utils/ner.rst +++ b/docs/source/reference/utils/ner.rst @@ -1,18 +1,18 @@ -``utils.ner`` module +``t_res.utils.ner`` module ==================== -.. autofunction:: utils.ner.training_tokenize_and_align_labels +.. autofunction:: t_res.utils.ner.training_tokenize_and_align_labels -.. autofunction:: utils.ner.collect_named_entities +.. autofunction:: t_res.utils.ner.collect_named_entities -.. autofunction:: utils.ner.aggregate_mentions +.. autofunction:: t_res.utils.ner.aggregate_mentions -.. autofunction:: utils.ner.fix_capitalization +.. autofunction:: t_res.utils.ner.fix_capitalization -.. autofunction:: utils.ner.fix_hyphens +.. autofunction:: t_res.utils.ner.fix_hyphens -.. autofunction:: utils.ner.fix_nested +.. autofunction:: t_res.utils.ner.fix_nested -.. autofunction:: utils.ner.fix_startEntity +.. autofunction:: t_res.utils.ner.fix_startEntity -.. autofunction:: utils.ner.aggregate_entities \ No newline at end of file +.. autofunction:: t_res.utils.ner.aggregate_entities \ No newline at end of file diff --git a/docs/source/reference/utils/preprocess_data.rst b/docs/source/reference/utils/preprocess_data.rst index 938773a5..73c73d8b 100644 --- a/docs/source/reference/utils/preprocess_data.rst +++ b/docs/source/reference/utils/preprocess_data.rst @@ -1,20 +1,20 @@ -``utils.preprocess_data`` module +``t_res.utils.preprocess_data`` module ================================ -.. automodule:: utils.preprocess_data +.. automodule:: t_res.utils.preprocess_data -.. autofunction:: utils.preprocess_data.turn_wikipedia2wikidata +.. autofunction:: t_res.utils.preprocess_data.turn_wikipedia2wikidata -.. autofunction:: utils.preprocess_data.reconstruct_sentences +.. 
autofunction:: t_res.utils.preprocess_data.reconstruct_sentences -.. autofunction:: utils.preprocess_data.process_lwm_for_ner +.. autofunction:: t_res.utils.preprocess_data.process_lwm_for_ner -.. autofunction:: utils.preprocess_data.process_lwm_for_linking +.. autofunction:: t_res.utils.preprocess_data.process_lwm_for_linking -.. autofunction:: utils.preprocess_data.aggregate_hipe_entities +.. autofunction:: t_res.utils.preprocess_data.aggregate_hipe_entities -.. autofunction:: utils.preprocess_data.process_hipe_for_linking +.. autofunction:: t_res.utils.preprocess_data.process_hipe_for_linking -.. autofunction:: utils.preprocess_data.process_tsv +.. autofunction:: t_res.utils.preprocess_data.process_tsv -.. autofunction:: utils.preprocess_data.fine_to_coarse \ No newline at end of file +.. autofunction:: t_res.utils.preprocess_data.fine_to_coarse \ No newline at end of file diff --git a/docs/source/reference/utils/process_data.rst b/docs/source/reference/utils/process_data.rst index 1f5f2066..25798b04 100644 --- a/docs/source/reference/utils/process_data.rst +++ b/docs/source/reference/utils/process_data.rst @@ -1,20 +1,20 @@ -``utils.process_data`` module +``t_res.utils.process_data`` module ============================= -.. autofunction:: utils.process_data.eval_with_exception +.. autofunction:: t_res.utils.process_data.eval_with_exception -.. autofunction:: utils.process_data.prepare_sents +.. autofunction:: t_res.utils.process_data.prepare_sents -.. autofunction:: utils.process_data.align_gold +.. autofunction:: t_res.utils.process_data.align_gold -.. autofunction:: utils.process_data.postprocess_predictions +.. autofunction:: t_res.utils.process_data.postprocess_predictions -.. autofunction:: utils.process_data.ner_and_process +.. autofunction:: t_res.utils.process_data.ner_and_process -.. autofunction:: utils.process_data.update_with_linking +.. autofunction:: t_res.utils.process_data.update_with_linking -.. 
autofunction:: utils.process_data.update_with_skyline +.. autofunction:: t_res.utils.process_data.update_with_skyline -.. autofunction:: utils.process_data.prepare_storing_links +.. autofunction:: t_res.utils.process_data.prepare_storing_links -.. autofunction:: utils.process_data.store_for_scorer +.. autofunction:: t_res.utils.process_data.store_for_scorer diff --git a/docs/source/reference/utils/process_wikipedia.rst b/docs/source/reference/utils/process_wikipedia.rst index 69f7e686..807ef9ee 100644 --- a/docs/source/reference/utils/process_wikipedia.rst +++ b/docs/source/reference/utils/process_wikipedia.rst @@ -1,8 +1,8 @@ -``utils.process_wikipedia`` module +``t_res.utils.process_wikipedia`` module ================================== -.. autofunction:: utils.process_wikipedia.make_wikilinks_consistent +.. autofunction:: t_res.utils.process_wikipedia.make_wikilinks_consistent -.. autofunction:: utils.process_wikipedia.make_wikipedia2wikidata_consisent +.. autofunction:: t_res.utils.process_wikipedia.make_wikipedia2wikidata_consisent -.. autofunction:: utils.process_wikipedia.title_to_id \ No newline at end of file +.. autofunction:: t_res.utils.process_wikipedia.title_to_id \ No newline at end of file diff --git a/docs/source/reference/utils/rel/entity_disambiguation.rst b/docs/source/reference/utils/rel/entity_disambiguation.rst index 1ace598e..9a690635 100644 --- a/docs/source/reference/utils/rel/entity_disambiguation.rst +++ b/docs/source/reference/utils/rel/entity_disambiguation.rst @@ -1,8 +1,8 @@ -``utils.REL.entity_disambiguation`` module +``t_res.utils.REL.entity_disambiguation`` module ========================================== -.. autoclass:: utils.REL.entity_disambiguation.EntityDisambiguation +.. autoclass:: t_res.utils.REL.entity_disambiguation.EntityDisambiguation :members: :undoc-members: -.. autoattribute:: utils.REL.entity_disambiguation.RANDOM_SEED \ No newline at end of file +.. 
autoattribute:: t_res.utils.REL.entity_disambiguation.RANDOM_SEED \ No newline at end of file diff --git a/docs/source/reference/utils/rel/mulrel_ranker.rst b/docs/source/reference/utils/rel/mulrel_ranker.rst index 7e4e77ea..a7352632 100644 --- a/docs/source/reference/utils/rel/mulrel_ranker.rst +++ b/docs/source/reference/utils/rel/mulrel_ranker.rst @@ -1,10 +1,10 @@ -``utils.REL.mulrel_ranker`` module +``t_res.utils.REL.mulrel_ranker`` module ================================== -.. autoclass:: utils.REL.mulrel_ranker.PreRank +.. autoclass:: t_res.utils.REL.mulrel_ranker.PreRank :members: :undoc-members: -.. autoclass:: utils.REL.mulrel_ranker.MulRelRanker +.. autoclass:: t_res.utils.REL.mulrel_ranker.MulRelRanker :members: :undoc-members: diff --git a/docs/source/reference/utils/rel/utils.rst b/docs/source/reference/utils/rel/utils.rst index 74641788..1597f964 100644 --- a/docs/source/reference/utils/rel/utils.rst +++ b/docs/source/reference/utils/rel/utils.rst @@ -1,10 +1,10 @@ -``utils.REL.utils`` module +``t_res.utils.REL.utils`` module ========================== -.. autofunction:: utils.REL.utils.flatten_list_of_lists +.. autofunction:: t_res.utils.REL.utils.flatten_list_of_lists -.. autofunction:: utils.REL.utils.make_equal_len +.. autofunction:: t_res.utils.REL.utils.make_equal_len -.. autofunction:: utils.REL.utils.is_important_word +.. autofunction:: t_res.utils.REL.utils.is_important_word -.. autoattribute:: utils.REL.utils.STOPWORDS \ No newline at end of file +.. autoattribute:: t_res.utils.REL.utils.STOPWORDS \ No newline at end of file diff --git a/docs/source/reference/utils/rel/vocabulary.rst b/docs/source/reference/utils/rel/vocabulary.rst index 5ab8da92..3516423d 100644 --- a/docs/source/reference/utils/rel/vocabulary.rst +++ b/docs/source/reference/utils/rel/vocabulary.rst @@ -1,6 +1,6 @@ -``utils.REL.vocabulary`` module +``t_res.utils.REL.vocabulary`` module =============================== -.. 
autoclass:: utils.REL.vocabulary.Vocabulary +.. autoclass:: t_res.utils.REL.vocabulary.Vocabulary :members: :undoc-members: diff --git a/docs/source/reference/utils/rel_e2e.rst b/docs/source/reference/utils/rel_e2e.rst index 7145d30c..64c3130a 100644 --- a/docs/source/reference/utils/rel_e2e.rst +++ b/docs/source/reference/utils/rel_e2e.rst @@ -1,16 +1,16 @@ -``utils.rel_e2e`` module +``t_res.utils.rel_e2e`` module ======================== -.. autofunction:: utils.rel_e2e.rel_end_to_end +.. autofunction:: t_res.utils.rel_e2e.rel_end_to_end -.. autofunction:: utils.rel_e2e.get_rel_from_api +.. autofunction:: t_res.utils.rel_e2e.get_rel_from_api -.. autofunction:: utils.rel_e2e.match_wikipedia_to_wikidata +.. autofunction:: t_res.utils.rel_e2e.match_wikipedia_to_wikidata -.. autofunction:: utils.rel_e2e.match_ent +.. autofunction:: t_res.utils.rel_e2e.match_ent -.. autofunction:: utils.rel_e2e.postprocess_rel +.. autofunction:: t_res.utils.rel_e2e.postprocess_rel -.. autofunction:: utils.rel_e2e.store_rel +.. autofunction:: t_res.utils.rel_e2e.store_rel -.. autofunction:: utils.rel_e2e.run_rel_experiments \ No newline at end of file +.. autofunction:: t_res.utils.rel_e2e.run_rel_experiments \ No newline at end of file diff --git a/docs/source/reference/utils/rel_utils.rst b/docs/source/reference/utils/rel_utils.rst index d3fb3638..0ce4cb52 100644 --- a/docs/source/reference/utils/rel_utils.rst +++ b/docs/source/reference/utils/rel_utils.rst @@ -1,14 +1,14 @@ -``utils.rel_utils`` module +``t_res.utils.rel_utils`` module ========================== -.. autofunction:: utils.rel_utils.get_db_emb +.. autofunction:: t_res.utils.rel_utils.get_db_emb -.. autofunction:: utils.rel_utils.eval_with_exception +.. autofunction:: t_res.utils.rel_utils.eval_with_exception -.. autofunction:: utils.rel_utils.prepare_initial_data +.. autofunction:: t_res.utils.rel_utils.prepare_initial_data -.. autofunction:: utils.rel_utils.rank_candidates +.. 
autofunction:: t_res.utils.rel_utils.rank_candidates -.. autofunction:: utils.rel_utils.add_publication +.. autofunction:: t_res.utils.rel_utils.add_publication -.. autofunction:: utils.rel_utils.prepare_rel_trainset \ No newline at end of file +.. autofunction:: t_res.utils.rel_utils.prepare_rel_trainset \ No newline at end of file diff --git a/evaluation/README.md b/evaluation/README.md index d4f44b2c..ad1f4b19 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -4,7 +4,7 @@ First, clone the [CLEF-HIPE-2020-scorer](https://github.com/impresso/CLEF-HIPE-2 ``` -git clone https://github.com/impresso/CLEF-HIPE-2020-scorer.git -cd CLEF-HIPE-2020-scorer +git clone https://github.com/impresso/CLEF-HIPE-2020-scorer.git HIPE-scorer +cd HIPE-scorer git checkout ac5c876eba58065195024cff550c2b5056986f7b ``` @@ -12,10 +12,10 @@ Then, to run the script: To assess the performance on toponym recognition: ```bash -python CLEF-HIPE-2020-scorer/clef_evaluation.py --ref ../experiments/outputs/results/lwm-true_bundle2_en_1.tsv --pred ../experiments/outputs/results/lwm-pred_bundle2_en_1.tsv --task nerc_coarse --outdir results/ +python HIPE-scorer/clef_evaluation.py --ref ../experiments/outputs/results/lwm-true_bundle2_en_1.tsv --pred ../experiments/outputs/results/lwm-pred_bundle2_en_1.tsv --task nerc_coarse --outdir results/ ``` To assess the performance on toponym resolution: ```bash -python CLEF-HIPE-2020-scorer/clef_evaluation.py --ref ../experiments/outputs/results/lwm-true_bundle2_en_1.tsv --pred ../experiments/outputs/results/lwm-pred_bundle2_en_1.tsv --task nel --outdir results/ +python HIPE-scorer/clef_evaluation.py --ref ../experiments/outputs/results/lwm-true_bundle2_en_1.tsv --pred ../experiments/outputs/results/lwm-pred_bundle2_en_1.tsv --task nel --outdir results/ ``` diff --git a/evaluation/display_results.py b/evaluation/display_results.py index e42718cd..67e32af0 100644 --- a/evaluation/display_results.py +++ b/evaluation/display_results.py @@ -9,8 +9,8 @@ "ignore", category=FutureWarning ) # To fix properly in the future -# Add 
"../" to path to import utils -sys.path.insert(0, os.path.abspath("HIPE-scorer/")) +# Add "./HIPE-scorer" to path to import HIPE-scorer +sys.path.insert(0, os.path.abspath("./HIPE-scorer/")) import clef_evaluation dApprNames = dict() diff --git a/examples/load_use_ner_model.ipynb b/examples/load_use_ner_model.ipynb index 6890cfd9..c4be7f47 100644 --- a/examples/load_use_ner_model.ipynb +++ b/examples/load_use_ner_model.ipynb @@ -21,8 +21,8 @@ "import os\n", "import sys\n", "\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import recogniser" + "\n", + "from t_res.geoparser import recogniser" ] }, { @@ -135,7 +135,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/run_pipeline_basic.ipynb b/examples/run_pipeline_basic.ipynb index 17417872..aa8aba10 100644 --- a/examples/run_pipeline_basic.ipynb +++ b/examples/run_pipeline_basic.ipynb @@ -19,8 +19,7 @@ "import os\n", "import sys\n", "\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline" + "from t_res.geoparser import pipeline" ] }, { @@ -37,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "geoparser = pipeline.Pipeline()" + "geoparser = pipeline.Pipeline(resources_path=\"../resources/\")" ] }, { @@ -134,6 +133,20 @@ "disamb_output = geoparser.run_disambiguation(mentions, candidates)\n", "print(disamb_output)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -152,7 +165,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/run_pipeline_deezy_mostpopular.ipynb b/examples/run_pipeline_deezy_mostpopular.ipynb 
index 8392dd90..9129b8b3 100644 --- a/examples/run_pipeline_deezy_mostpopular.ipynb +++ b/examples/run_pipeline_deezy_mostpopular.ipynb @@ -17,8 +17,8 @@ "import os\n", "import sys\n", "from pathlib import Path\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline, ranking, linking" + "\n", + "from t_res.geoparser import pipeline, ranking, linking" ] }, { @@ -31,7 +31,7 @@ "# Instantiate the ranker:\n", "myranker = ranking.Ranker(\n", " method=\"deezymatch\",\n", - " resources_path=\"../resources/wikidata/\",\n", + " resources_path=\"../resources/\",\n", " strvar_parameters={\n", " # Parameters to create the string pair dataset:\n", " \"ocr_threshold\": 60,\n", @@ -91,6 +91,13 @@ "for r in resolved:\n", " print(r)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -109,7 +116,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb b/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb index b79ec83d..8f89e400 100644 --- a/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb +++ b/examples/run_pipeline_deezy_reldisamb+wmtops.ipynb @@ -20,8 +20,8 @@ "import sys\n", "import sqlite3\n", "from pathlib import Path\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline, ranking, linking" + "\n", + "from t_res.geoparser import pipeline, ranking, linking" ] }, { @@ -34,7 +34,7 @@ "# Instantiate the ranker:\n", "myranker = ranking.Ranker(\n", " method=\"deezymatch\",\n", - " resources_path=\"../resources/wikidata/\",\n", + " resources_path=\"../resources/\",\n", " strvar_parameters=dict(),\n", " deezy_parameters={\n", " # Paths and filenames of DeezyMatch models and data:\n", diff --git a/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb 
b/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb index f7b9ec99..6e74593f 100644 --- a/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb +++ b/examples/run_pipeline_deezy_reldisamb+wpubl+wmtops.ipynb @@ -20,8 +20,8 @@ "import sys\n", "import sqlite3\n", "from pathlib import Path\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline, ranking, linking" + "\n", + "from t_res.geoparser import pipeline, ranking, linking" ] }, { @@ -34,7 +34,7 @@ "# Instantiate the ranker:\n", "myranker = ranking.Ranker(\n", " method=\"deezymatch\",\n", - " resources_path=\"../resources/wikidata/\",\n", + " resources_path=\"../resources/\",\n", " strvar_parameters={\n", " # Parameters to create the string pair dataset:\n", " \"ocr_threshold\": 60,\n", diff --git a/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb b/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb index 3375ce41..688a81de 100644 --- a/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb +++ b/examples/run_pipeline_deezy_reldisamb+wpubl.ipynb @@ -20,8 +20,8 @@ "import sys\n", "import sqlite3\n", "from pathlib import Path\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline, ranking, linking" + "\n", + "from t_res.geoparser import pipeline, ranking, linking" ] }, { @@ -34,7 +34,7 @@ "# Instantiate the ranker:\n", "myranker = ranking.Ranker(\n", " method=\"deezymatch\",\n", - " resources_path=\"../resources/wikidata/\",\n", + " resources_path=\"../resources/\",\n", " strvar_parameters=dict(),\n", " deezy_parameters={\n", " # Paths and filenames of DeezyMatch models and data:\n", diff --git a/examples/run_pipeline_deezy_reldisamb.ipynb b/examples/run_pipeline_deezy_reldisamb.ipynb index 4d7bf262..445c1a7d 100644 --- a/examples/run_pipeline_deezy_reldisamb.ipynb +++ b/examples/run_pipeline_deezy_reldisamb.ipynb @@ -20,8 +20,8 @@ "import sys\n", "import sqlite3\n", "from pathlib import Path\n", - "sys.path.insert(0, 
os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline, ranking, linking" + "\n", + "from t_res.geoparser import pipeline, ranking, linking" ] }, { @@ -34,7 +34,7 @@ "# Instantiate the ranker:\n", "myranker = ranking.Ranker(\n", " method=\"deezymatch\",\n", - " resources_path=\"../resources/wikidata/\",\n", + " resources_path=\"../resources/\",\n", " mentions_to_wikidata=dict(),\n", " wikidata_to_mentions=dict(),\n", " strvar_parameters={\n", @@ -125,6 +125,13 @@ "for r in resolved:\n", " print(r)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -143,7 +150,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/run_pipeline_modular.ipynb b/examples/run_pipeline_modular.ipynb index 091422b9..40e5aac1 100644 --- a/examples/run_pipeline_modular.ipynb +++ b/examples/run_pipeline_modular.ipynb @@ -10,8 +10,8 @@ "import sys\n", "import sqlite3\n", "from pathlib import Path\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline, ranking, linking" + "\n", + "from t_res.geoparser import pipeline, ranking, linking" ] }, { @@ -24,7 +24,7 @@ "# Instantiate the ranker:\n", "myranker = ranking.Ranker(\n", " method=\"deezymatch\",\n", - " resources_path=\"../resources/wikidata/\",\n", + " resources_path=\"../resources/\",\n", " strvar_parameters={\n", " # Parameters to create the string pair dataset:\n", " \"ocr_threshold\": 60,\n", @@ -135,6 +135,13 @@ "source": [ "output_disamb" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -153,7 +160,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git 
a/examples/run_pipeline_perfect_mostpopular.ipynb b/examples/run_pipeline_perfect_mostpopular.ipynb index 7f6cae50..4a11aa63 100644 --- a/examples/run_pipeline_perfect_mostpopular.ipynb +++ b/examples/run_pipeline_perfect_mostpopular.ipynb @@ -17,8 +17,7 @@ "import os\n", "import sys\n", "\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import pipeline, ranking, linking" + "from t_res.geoparser import pipeline, ranking, linking" ] }, { @@ -29,7 +28,7 @@ "source": [ "myranker = ranking.Ranker(\n", " method=\"perfectmatch\",\n", - " resources_path=\"../resources/wikidata/\",\n", + " resources_path=\"../resources/\",\n", ")\n" ] }, @@ -64,6 +63,13 @@ "for r in resolved:\n", " print(r)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -82,7 +88,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/train_use_deezy_model_1.ipynb b/examples/train_use_deezy_model_1.ipynb index e2ce98f9..f379360a 100644 --- a/examples/train_use_deezy_model_1.ipynb +++ b/examples/train_use_deezy_model_1.ipynb @@ -52,8 +52,7 @@ "import sys\n", "from pathlib import Path\n", "\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import ranking" + "from t_res.geoparser import ranking" ] }, { @@ -72,7 +71,7 @@ "source": [ "myranker = ranking.Ranker(\n", " method=\"deezymatch\", # Here we're telling the ranker to use DeezyMatch.\n", - " resources_path=\"../resources/wikidata/\", # Here, the path to the Wikidata resources.\n", + " resources_path=\"../resources/\", # Here, the path to the Wikidata resources.\n", " # Parameters to create the string pair dataset:\n", " strvar_parameters={\n", " \"ocr_threshold\": 60,\n", @@ -154,8 +153,26 @@ "source": [ "# Find candidates given a toponym:\n", "toponym = \"Manchefter\"\n", - 
"print(myranker.find_candidates([{\"mention\": toponym}])[0][toponym])" + "print(myranker.find_candidates([{\"mention\": toponym}])[toponym])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find candidates given a toponym:\n", + "toponym = \"Londen\"\n", + "print(myranker.find_candidates([{\"mention\": toponym}])[toponym])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -174,7 +191,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/train_use_deezy_model_2.ipynb b/examples/train_use_deezy_model_2.ipynb index 5045329e..4214457f 100644 --- a/examples/train_use_deezy_model_2.ipynb +++ b/examples/train_use_deezy_model_2.ipynb @@ -47,8 +47,7 @@ "import sys\n", "from pathlib import Path\n", "\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import ranking" + "from t_res.geoparser import ranking" ] }, { @@ -67,7 +66,7 @@ "source": [ "myranker = ranking.Ranker(\n", " method=\"deezymatch\", # Here we're telling the ranker to use DeezyMatch.\n", - " resources_path=\"../resources/wikidata/\", # Here, the path to the Wikidata resources.\n", + " resources_path=\"../resources/\", # Here, the path to the Wikidata resources.\n", " # Parameters to create the string pair dataset:\n", " strvar_parameters={\n", " \"overwrite_dataset\": False,\n", @@ -124,7 +123,7 @@ "outputs": [], "source": [ "# Train a DeezyMatch model if needed:\n", - "myranker.train()" + "myranker.mentions_to_wikidata = myranker.train()" ] }, { @@ -143,8 +142,26 @@ "source": [ "# Find candidates given a toponym:\n", "toponym = \"Manchefter\"\n", - "print(myranker.find_candidates([{\"mention\": toponym}])[0][toponym])" + "print(myranker.find_candidates([{\"mention\": toponym}])[toponym])" ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Find candidates given a toponym:\n", + "toponym = \"Londen\"\n", + "print(myranker.find_candidates([{\"mention\": toponym}])[toponym])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -163,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/train_use_deezy_model_3.ipynb b/examples/train_use_deezy_model_3.ipynb index 3b9a0c35..28aa9f78 100644 --- a/examples/train_use_deezy_model_3.ipynb +++ b/examples/train_use_deezy_model_3.ipynb @@ -49,8 +49,7 @@ "import sys\n", "from pathlib import Path\n", "\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import ranking" + "from t_res.geoparser import ranking" ] }, { @@ -69,7 +68,7 @@ "source": [ "myranker = ranking.Ranker(\n", " method=\"deezymatch\", # Here we're telling the ranker to use DeezyMatch.\n", - " resources_path=\"../resources/wikidata/\", # Here, the path to the Wikidata resources.\n", + " resources_path=\"../resources/\", # Here, the path to the Wikidata resources.\n", " # Parameters to create the string pair dataset:\n", " strvar_parameters={\n", " \"overwrite_dataset\": False,\n", @@ -127,7 +126,7 @@ "source": [ "# Find candidates given a toponym:\n", "toponym = \"Ashton-cnderLyne\"\n", - "print(myranker.find_candidates([{\"mention\": toponym}])[0][toponym])" + "print(myranker.find_candidates([{\"mention\": toponym}])[toponym])" ] }, { @@ -138,8 +137,15 @@ "source": [ "# Find candidates given a toponym:\n", "toponym = \"Shefiield\"\n", - "print(myranker.find_candidates([{\"mention\": toponym}])[0][toponym])" + "print(myranker.find_candidates([{\"mention\": toponym}])[toponym])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [] } ], "metadata": { @@ -158,7 +164,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/examples/train_use_ner_model.ipynb b/examples/train_use_ner_model.ipynb index c0e3542e..aa54766f 100644 --- a/examples/train_use_ner_model.ipynb +++ b/examples/train_use_ner_model.ipynb @@ -21,8 +21,7 @@ "import os\n", "import sys\n", "\n", - "sys.path.insert(0, os.path.abspath(os.path.pardir))\n", - "from geoparser import recogniser" + "from t_res.geoparser import recogniser" ] }, { @@ -138,6 +137,13 @@ "predictions = myner.ner_predict(sentence)\n", "print([pred for pred in predictions if pred[\"entity\"] != \"O\"]) # Note that, if you've trained the model in the test mode, the model will probably not identify \"Sheffield\" as a location." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -156,7 +162,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.17" }, "orig_nbformat": 4 }, diff --git a/experiments/README.md b/experiments/README.md index 1195cc03..bc7a3111 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -20,16 +20,22 @@ You will also need the [word2vec embeddings](TODO: add link) trained from 19th C To create the datasets that we use in the experiments presented in the paper, run the following command: ```bash -python prepare_data.py +python prepare_data.py -p ../resources ``` + +> **_NOTE:_** Use the ``-p`` flag to indicate the path to your resources directory. + This script takes care of downloading the LwM and HIPE datasets and format them as needed in the experiments. ### 3. 
Running the experiments To run the experiments, run the following script: ```bash -python toponym_resolution.py +python toponym_resolution.py -p ../resources ``` + +> **_NOTE:_** Use the ``-p`` flag to indicate the path to your resources directory. + This script does runs for all different scenarios reported in the experiments in the paper. ### 4. Evaluate diff --git a/geoparser/__init__.py b/experiments/__init__.py similarity index 100% rename from geoparser/__init__.py rename to experiments/__init__.py diff --git a/experiments/experiment.py b/experiments/experiment.py index 2e758586..1ab1412d 100644 --- a/experiments/experiment.py +++ b/experiments/experiment.py @@ -7,9 +7,8 @@ import pandas as pd from tqdm import tqdm -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from geoparser import linking, ranking, recogniser -from utils import process_data, rel_utils +from t_res.geoparser import linking, ranking, recogniser +from t_res.utils import process_data, rel_utils class Experiment: @@ -467,10 +466,12 @@ def create_mentions_df(self) -> pd.DataFrame: data=rows, ) + print(f"Saving to {os.path.join(self.data_path,self.dataset,f'{self.myner.model}_{cand_approach}')}") output_path = ( - self.data_path + self.dataset + "/" + self.myner.model + "_" + cand_approach + os.path.join(self.data_path,self.dataset,f"{self.myner.model}_{cand_approach}") ) + # List of columns to merge (i.e. 
columns where we have indicated # out data splits), and "article_id", the columns on which we # will merge the data: @@ -808,6 +809,6 @@ def linking_experiments(self) -> None: # ----------------------------------------------- # Run end-to-end REL experiments: if self.rel_experiments == True: - from utils import rel_e2e + from t_res.utils import rel_e2e rel_e2e.run_rel_experiments(self) diff --git a/experiments/prepare_data.py b/experiments/prepare_data.py index c84cbe83..89ff300a 100644 --- a/experiments/prepare_data.py +++ b/experiments/prepare_data.py @@ -1,24 +1,36 @@ -import os -import sys - -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) import json import os import random +import sys +from argparse import ArgumentParser from pathlib import Path import pandas as pd from sklearn.model_selection import train_test_split -from utils import get_data, preprocess_data +from t_res.utils import get_data, preprocess_data RANDOM_SEED = 42 random.seed(RANDOM_SEED) -resources = "../resources/" # path to resources -output_path_lwm = "../experiments/outputs/data/lwm/" -output_path_hipe = "../experiments/outputs/data/hipe/" +parser = ArgumentParser() +parser.add_argument( + "-p", + "--path", + dest="path", + help="path to resources directory", + action="store", + type=str, +) + +args = parser.parse_args() + +resources_dir = args.path + +current_dir = Path(__file__).parent.resolve() +output_path_lwm = os.path.join(current_dir, "outputs/data/lwm/") +output_path_hipe = os.path.join(current_dir, "outputs/data/hipe/") + # Create output folders for processed data if they do not exist: Path(output_path_lwm).mkdir(parents=True, exist_ok=True) Path(output_path_hipe).mkdir(parents=True, exist_ok=True) @@ -32,7 +44,7 @@ gazetteer_ids = set( list( pd.read_csv( - os.path.join(resources, "wikidata", "wikidata_gazetteer.csv"), + os.path.join(resources_dir, "wikidata", "wikidata_gazetteer.csv"), low_memory=False, )["wikidata_id"].unique() ) @@ -44,7 
+56,7 @@ # ------------------------------------------------------ # Load publication metadata -with open(os.path.join(f"{resources}", "publication_metadata.json")) as jsonfile: +with open(os.path.join(resources_dir, "publication_metadata.json")) as jsonfile: df_metadata = json.load(jsonfile) dict_titles = {k: df_metadata[k]["publication_title"] for k in df_metadata} @@ -57,20 +69,18 @@ # ------------------------------------------------------ # Path of the manually annotated data: -news_path = os.path.join(f"{resources}", "news_datasets") +news_path = os.path.join(resources_dir, "news_datasets") # Download the annotated data from the BL repository: get_data.download_lwm_data(news_path) # Training data from the manually annotated data: topres_path_train = os.path.join( - f"{resources}", "news_datasets", "topRes19th_v2", "train" + resources_dir, "news_datasets", "topRes19th_v2", "train" ) # Test data from the manually annotated data: -topres_path_test = os.path.join( - f"{resources}", "news_datasets", "topRes19th_v2", "test" -) +topres_path_test = os.path.join(resources_dir, "news_datasets", "topRes19th_v2", "test") # Process data for training a named entity recognition model: lwm_df = preprocess_data.process_lwm_for_ner(topres_path_train) @@ -104,8 +114,12 @@ ) # Process data for the resolution experiments: -lwm_train_df = preprocess_data.process_lwm_for_linking(topres_path_train, gazetteer_ids) -lwm_test_df = preprocess_data.process_lwm_for_linking(topres_path_test, gazetteer_ids) +lwm_train_df = preprocess_data.process_lwm_for_linking( + resources_dir, topres_path_train, gazetteer_ids +) +lwm_test_df = preprocess_data.process_lwm_for_linking( + resources_dir, topres_path_test, gazetteer_ids +) # Split train set into train and dev set, by article: lwm_train_df, lwm_dev_df = train_test_split( diff --git a/experiments/toponym_resolution.py b/experiments/toponym_resolution.py index 0fb55bd8..dee4af56 100644 --- a/experiments/toponym_resolution.py +++ 
b/experiments/toponym_resolution.py @@ -1,14 +1,28 @@ import os import sqlite3 import sys +from argparse import ArgumentParser from pathlib import Path +import experiment import pandas as pd -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from experiments import experiment -from geoparser import linking, ranking, recogniser +from t_res.geoparser import linking, ranking, recogniser + +parser = ArgumentParser() +parser.add_argument( + "-p", + "--path", + dest="path", + help="path to resources directory", + action="store", + type=str, +) + +args = parser.parse_args() + +resources_dir = args.path +current_dir = Path(__file__).parent.resolve() # Choose test scenario: # * "dev" while developing and experimenting, @@ -52,10 +66,12 @@ # Instantiate the recogniser: myner = recogniser.Recogniser( model="blb_lwm-ner-" + granularity, - train_dataset="../experiments/outputs/data/lwm/ner_" + train_dataset=str(current_dir) + + "/outputs/data/lwm/ner_" + granularity + "_train.json", # Path to the json file containing the training set (see note above). - test_dataset="../experiments/outputs/data/lwm/ner_" + test_dataset=str(current_dir) + + "/outputs/data/lwm/ner_" + granularity + "_dev.json", # Path to the json file containing the test set (see note above). pipe=None, # We'll store the NER pipeline here, leave this empty. @@ -65,7 +81,9 @@ # https://huggingface.co/Livingwithmachines/bert_1760_1900). You can # chose any other model from the HuggingFace hub, as long as it's # trained on the "Fill-Mask" objective (filter by task). - model_path="../resources/models/", # Path where the NER model will be stored. + model_path=os.path.join( + resources_dir, "models/" + ), # Path where the NER model will be stored. 
training_args={ "batch_size": 8, "num_train_epochs": 10, @@ -81,7 +99,7 @@ # Instantiate the ranker: myranker = ranking.Ranker( method=cand_select_method, - resources_path="../resources/wikidata/", + resources_path=resources_dir, mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -90,13 +108,13 @@ "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("../resources/models/w2v/").resolve()), + "w2v_ocr_path": os.path.join(resources_dir, "models/w2v/"), "w2v_ocr_model": "w2v_*_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("../resources/deezymatch/").resolve()), + "dm_path": os.path.join(resources_dir, "deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -113,15 +131,17 @@ # -------------------------------------- # Instantiate the linker: - with sqlite3.connect("../resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect( + os.path.join(resources_dir, "rel_db/embeddings_database.db") + ) as conn: cursor = conn.cursor() mylinker = linking.Linker( method=top_res_method, - resources_path="../resources/", + resources_path=resources_dir, linking_resources=dict(), rel_params={ - "model_path": "../resources/models/disambiguation/", - "data_path": "../experiments/outputs/data/lwm/", + "model_path": os.path.join(resources_dir, "models/disambiguation/"), + "data_path": os.path.join(current_dir, "outputs/data/lwm/"), "training_split": "", "db_embeddings": cursor, "with_publication": wpubl, @@ -137,9 +157,9 @@ # Instantiate the experiment: myexperiment = experiment.Experiment( dataset=dataset, - data_path="outputs/data/", + data_path=os.path.join(current_dir, "outputs/data/"), dataset_df=pd.DataFrame(), - results_path="outputs/results/", + results_path=os.path.join(current_dir, "outputs/results/"), myner=myner, myranker=myranker, mylinker=mylinker, diff --git a/pyproject.toml 
b/pyproject.toml index 20fd53e8..e1dd0e53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [tool.poetry] -name = "lwm_t_res" +name = "t_res" version = "0.1.0" description = "" authors = ["Federico Nanni "] [tool.poetry.dependencies] -python = "^3.9" +python = ">=3.9, <4.0" tqdm = "^4.62.3" bs4 = "^0.0.1" pandas = "^1.3.4" @@ -34,20 +34,24 @@ fastapi = "^0.87.0" uvicorn = {extras = ["standard"], version = "^0.20.0"} ipykernel = "^6.21.3" python-levenshtein = "^0.20.9" -Sphinx = { version = "4.2.0", optional = true } -sphinx-rtd-theme = { version = "1.0.0", optional = true } -sphinxcontrib-napoleon = { version = "0.7", optional = true } -torch = "1.13.1" -accelerate = "^0.21.0" +torch = "^1.13.1" +accelerate = "^0.27.2" +scipy = "<=1.11.0" -[tool.poetry.dev-dependencies] -pytest = "^5.2" +[tool.poetry.group.dev.dependencies] +pytest = "^7" jupyter = "^1.0.0" black = "^22.3.0" flake8 = "^6.0.0" isort = "^5.12.0" pre-commit = "^3.3.1" +[tool.poetry.group.docs.dependencies] +Sphinx = "^5.0.0" +sphinx-rtd-theme = "^1.0.0" +sphinxcontrib-napoleon = "^0.7" +sphinx-copybutton = "^0.5.2" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" @@ -59,5 +63,7 @@ include = '\.pyi?$' [tool.isort] profile = "black" -[tool.poetry.extras] -docs = ["Sphinx", "sphinx-rtd-theme", "sphinxcontrib-napoleon"] +[tool.pytest.ini_options] +markers = [ + "deezy: tests which need a deezy model", +] diff --git a/utils/REL/__init__.py b/t_res/__init__.py similarity index 100% rename from utils/REL/__init__.py rename to t_res/__init__.py diff --git a/utils/__init__.py b/t_res/geoparser/__init__.py similarity index 100% rename from utils/__init__.py rename to t_res/geoparser/__init__.py diff --git a/geoparser/linking.py b/t_res/geoparser/linking.py similarity index 90% rename from geoparser/linking.py rename to t_res/geoparser/linking.py index 6295f5f7..bb21749b 100644 --- a/geoparser/linking.py +++ b/t_res/geoparser/linking.py @@ -14,12 +14,9 
@@ RANDOM_SEED = 42 np.random.seed(RANDOM_SEED) -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) - -from geoparser import ranking -from utils import rel_utils -from utils.REL import entity_disambiguation +from ..utils import rel_utils +from ..utils.REL import entity_disambiguation +from . import ranking class Linker: @@ -31,7 +28,9 @@ class Linker: Arguments: method (Literal["mostpopular", "reldisamb", "bydistance"]): The linking method to use. - resources_path (str, optional): The path to the linking resources. + resources_path (str): The path to the linking resources. + experiments_path (str, optional): The path to the experiments + directory. Default is "../experiments/". linking_resources (dict, optional): Dictionary containing the necessary linking resources. Defaults to ``dict()`` (an empty dictionary). @@ -48,7 +47,8 @@ class Linker: linker = Linker( method="mostpopular", - resources_path="/path/to/linking/resources/", + resources_path="/path/to/resources/", + experiments_path="/path/to/experiments/", linking_resources={}, overwrite_training=True, rel_params={"with_publication": True, "do_test": True} @@ -68,6 +68,7 @@ class Linker: mylinker = linking.Linker( method="reldisamb", resources_path="../resources/", + experiments_path="../experiments/", linking_resources=dict(), rel_params={ "model_path": "../resources/models/disambiguation/", @@ -107,28 +108,36 @@ def __init__( self, method: Literal["mostpopular", "reldisamb", "bydistance"], resources_path: str, + experiments_path: Optional[str] = "../experiments", linking_resources: Optional[dict] = dict(), overwrite_training: Optional[bool] = False, - rel_params: Optional[dict] = { - "model_path": "../resources/models/disambiguation/", - "data_path": "../experiments/outputs/data/lwm/", - "training_split": "originalsplit", - "db_embeddings": None, # The cursor to the embeddings database. 
- "with_publication": True, - "without_microtoponyms": True, - "do_test": False, - "default_publname": "United Kingdom", - "default_publwqid": "Q145", - }, + rel_params: Optional[dict] = None, + rel_device: Optional[str] = None, ): """ Initialises a Linker object. """ self.method = method self.resources_path = resources_path + self.experiments_path = experiments_path self.linking_resources = linking_resources self.overwrite_training = overwrite_training + + if rel_params is None: + rel_params = { + "model_path": os.path.join(resources_path, "models/disambiguation/"), + "data_path": os.path.join(experiments_path, "outputs/data/lwm/"), + "training_split": "originalsplit", + "db_embeddings": None, # The cursor to the embeddings database. + "with_publication": True, + "without_microtoponyms": True, + "do_test": False, + "default_publname": "United Kingdom", + "default_publwqid": "Q145", + } + self.rel_params = rel_params + self.rel_device = rel_device def __str__(self) -> str: """ @@ -156,12 +165,14 @@ def load_resources(self) -> dict: # Load Wikidata mentions-to-QID with absolute counts: print(" > Loading mentions to wikidata mapping.") - with open(self.resources_path + "wikidata/mentions_to_wikidata.json", "r") as f: + with open( + os.path.join(self.resources_path, "wikidata/mentions_to_wikidata.json"), "r" + ) as f: self.linking_resources["mentions_to_wikidata"] = json.load(f) print(" > Loading gazetteer.") gaz = pd.read_csv( - f"{self.resources_path}wikidata/wikidata_gazetteer.csv", + os.path.join(self.resources_path, "wikidata/wikidata_gazetteer.csv"), usecols=["wikidata_id", "latitude", "longitude"], ) gaz["latitude"] = gaz["latitude"].astype(float) @@ -177,7 +188,9 @@ def load_resources(self) -> dict: # The entity2class.txt file is created as the last step in # wikipedia processing: - with open(f"{self.resources_path}wikidata/entity2class.txt", "r") as f: + with open( + os.path.join(self.resources_path, "wikidata/entity2class.txt"), "r" + ) as f: 
self.linking_resources["entity2class"] = json.load(f) print("*** Linking resources loaded!\n") @@ -444,6 +457,8 @@ def train_load_model( "mode": "train", "model_path": os.path.join(linker_name, "model"), } + if self.rel_device is not None: + config_rel["device"] = self.rel_device # Instantiate the entity disambiguation model: model = entity_disambiguation.EntityDisambiguation( @@ -465,6 +480,8 @@ def train_load_model( "mode": "eval", "model_path": os.path.join(linker_name, "model"), } + if self.rel_device is not None: + config_rel["device"] = self.rel_device model = entity_disambiguation.EntityDisambiguation( self.rel_params["db_embeddings"], diff --git a/geoparser/pipeline.py b/t_res/geoparser/pipeline.py similarity index 96% rename from geoparser/pipeline.py rename to t_res/geoparser/pipeline.py index dc0d095d..2d3230cf 100644 --- a/geoparser/pipeline.py +++ b/t_res/geoparser/pipeline.py @@ -5,10 +5,8 @@ from sentence_splitter import split_text_into_sentences -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from geoparser import linking, ranking, recogniser -from utils import ner, rel_utils +from ..utils import ner, rel_utils +from . import linking, ranking, recogniser class Pipeline: @@ -28,6 +26,9 @@ class Pipeline: mylinker (linking.Linker, optional): The ``Linker`` object to use in the pipeline. If None, the default ``Linker`` will be instantiated. For the default settings, see Notes below. + resources_path (str, optional): The path to your resources directory. + experiments_path (str, optional): The path to the experiments directory. + Default is "../experiments". 
Example: >>> # Instantiate the Pipeline object with a default setup @@ -57,7 +58,7 @@ class Pipeline: ranking.Ranker( method="perfectmatch", - resources_path="../resources/wikidata/", + resources_path=resources_path, ) * The default settings for the ``Linker``: @@ -66,7 +67,7 @@ class Pipeline: linking.Linker( method="mostpopular", - resources_path="../resources/", + resources_path=resources_path, ) """ @@ -75,6 +76,9 @@ def __init__( myner: Optional[recogniser.Recogniser] = None, myranker: Optional[ranking.Ranker] = None, mylinker: Optional[linking.Linker] = None, + resources_path: Optional[str] = None, + experiments_path: Optional[str] = None, + ner_device: Optional[str] = None, ): """ Instantiates a Pipeline object. @@ -89,21 +93,34 @@ def __init__( self.myner = recogniser.Recogniser( model="Livingwithmachines/toponym-19thC-en", load_from_hub=True, + device=ner_device, ) # If myranker is None, instantiate the default Ranker. if not self.myranker: + if not resources_path: + raise ValueError("[ERROR] Please specify path to resources directory.") self.myranker = ranking.Ranker( method="perfectmatch", - resources_path="../resources/wikidata/", + resources_path=resources_path, ) # If mylinker is None, instantiate the default Linker. 
if not self.mylinker: - self.mylinker = linking.Linker( - method="mostpopular", - resources_path="../resources/", - ) + if not resources_path: + raise ValueError("[ERROR] Please specify path to resources directory.") + + if experiments_path: + self.mylinker = linking.Linker( + method="mostpopular", + resources_path=resources_path, + experiments_path=experiments_path, + ) + else: + self.mylinker = linking.Linker( + method="mostpopular", + resources_path=resources_path, + ) # ----------------------------------------- # NER training and creating pipeline: @@ -135,9 +152,6 @@ def __init__( self.myranker ) - # Check we've actually loaded the mentions2wikidata dictionary: - assert self.myranker.mentions_to_wikidata["London"] is not None - def run_sentence( self, sentence: str, diff --git a/geoparser/ranking.py b/t_res/geoparser/ranking.py similarity index 93% rename from geoparser/ranking.py rename to t_res/geoparser/ranking.py index 63bdd3b6..a314003c 100644 --- a/geoparser/ranking.py +++ b/t_res/geoparser/ranking.py @@ -9,9 +9,7 @@ from pandarallel import pandarallel from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from utils import deezy_processing +from ..utils import deezy_processing class Ranker: @@ -111,31 +109,8 @@ def __init__( resources_path: str, mentions_to_wikidata: Optional[dict] = dict(), wikidata_to_mentions: Optional[dict] = dict(), - strvar_parameters: Optional[dict] = { - # Parameters to create the string pair dataset: - "ocr_threshold": 60, - "top_threshold": 85, - "min_len": 5, - "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", - "overwrite_dataset": False, - }, - deezy_parameters: Optional[dict] = { - # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), - "dm_cands": "wkdtalts", - "dm_model": "w2v_ocr", - 
"dm_output": "deezymatch_on_the_fly", - # Ranking measures: - "ranking_metric": "faiss", - "selection_threshold": 50, - "num_candidates": 1, - "verbose": False, - # DeezyMatch training: - "overwrite_training": False, - "do_test": False, - }, + strvar_parameters: Optional[dict] = None, + deezy_parameters: Optional[dict] = None, already_collected_cands: Optional[dict] = dict(), ): """ @@ -145,6 +120,37 @@ def __init__( self.resources_path = resources_path self.mentions_to_wikidata = mentions_to_wikidata self.wikidata_to_mentions = wikidata_to_mentions + + # set paths based on resources path + if strvar_parameters is None: + strvar_parameters = { + # Parameters to create the string pair dataset: + "ocr_threshold": 60, + "top_threshold": 85, + "min_len": 5, + "max_len": 15, + "w2v_ocr_path": os.path.join(resources_path, "models/w2v/"), + "w2v_ocr_model": "w2v_*_news", + "overwrite_dataset": False, + } + + if deezy_parameters is None: + deezy_parameters = { + # Paths and filenames of DeezyMatch models and data: + "dm_path": os.path.join(resources_path, "deezymatch/"), + "dm_cands": "wkdtalts", + "dm_model": "w2v_ocr", + "dm_output": "deezymatch_on_the_fly", + # Ranking measures: + "ranking_metric": "faiss", + "selection_threshold": 50, + "num_candidates": 1, + "verbose": False, + # DeezyMatch training: + "overwrite_training": False, + "do_test": False, + } + self.strvar_parameters = strvar_parameters self.deezy_parameters = deezy_parameters self.already_collected_cands = already_collected_cands @@ -203,8 +209,12 @@ def load_resources(self) -> dict: # Load files files = { - "mentions_to_wikidata": f"{self.resources_path}mentions_to_wikidata_normalized.json", - "wikidata_to_mentions": f"{self.resources_path}wikidata_to_mentions_normalized.json", + "mentions_to_wikidata": os.path.join( + self.resources_path, "wikidata/mentions_to_wikidata_normalized.json" + ), + "wikidata_to_mentions": os.path.join( + self.resources_path, "wikidata/wikidata_to_mentions_normalized.json" + 
), } with open(files["mentions_to_wikidata"], "r") as f: @@ -275,7 +285,9 @@ def train(self) -> None: if self.deezy_parameters["do_test"] == True: self.deezy_parameters["dm_model"] += "_test" self.deezy_parameters["dm_cands"] += "_test" - deezy_processing.train_deezy_model(self.deezy_parameters, self.strvar_parameters, self.wikidata_to_mentions) + deezy_processing.train_deezy_model( + self.deezy_parameters, self.strvar_parameters, self.wikidata_to_mentions + ) deezy_processing.generate_candidates( self.deezy_parameters, self.mentions_to_wikidata ) @@ -490,7 +502,7 @@ def deezy_on_the_fly(self, queries: List[str]) -> Tuple[dict, dict]: Example: >>> ranker = Ranker(...) - >>> ranker.mentions_to_wikidata = ranker.load_resources() + >>> ranker.load_resources() >>> queries = ['London', 'Shefrield'] >>> candidates, already_collected = ranker.deezy_on_the_fly(queries) >>> print(candidates) @@ -583,7 +595,7 @@ def run(self, queries: List[str]) -> Tuple[dict, dict]: Example: >>> myranker = Ranker(method="perfectmatch", ...) - >>> myranker.mentions_to_wikidata = myranker.load_resources() + >>> ranker.mentions_to_wikidata = myranker.load_resources() >>> queries = ['London', 'Barcelona', 'Bologna'] >>> candidates, already_collected = myranker.run(queries) >>> print(candidates) diff --git a/geoparser/recogniser.py b/t_res/geoparser/recogniser.py similarity index 96% rename from geoparser/recogniser.py rename to t_res/geoparser/recogniser.py index 54d24b2a..214a4f0a 100644 --- a/geoparser/recogniser.py +++ b/t_res/geoparser/recogniser.py @@ -16,9 +16,7 @@ pipeline, ) -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from utils import ner +from ..utils import ner class Recogniser: @@ -96,6 +94,7 @@ def __init__( overwrite_training: Optional[bool] = False, do_test: Optional[bool] = False, load_from_hub: Optional[bool] = False, + device: Optional[str] = None, ): """ Initialises a Recogniser object. 
@@ -110,6 +109,7 @@ def __init__( self.overwrite_training = overwrite_training self.do_test = do_test self.load_from_hub = load_from_hub + self.device = device # Add "_test" to the model name if do_test is True, unless # the model is downloaded from Huggingface, in which case @@ -167,7 +167,7 @@ def train(self) -> None: return None # If model exists and overwrite is set to False, skip training: - model_path = f"{self.model_path}{self.model}.model" + model_path = os.path.join(self.model_path,f"{self.model}.model") if Path(model_path).exists() and self.overwrite_training == False: s = "\n** Note: Model " s += f"{model_path} is already trained.\n" @@ -272,7 +272,7 @@ def compute_metrics(p: Tuple[list, list]) -> dict: training_args = TrainingArguments( output_dir=self.model_path, evaluation_strategy="epoch", - logging_dir=self.model_path + "runs/" + self.model, + logging_dir=os.path.join(self.model_path,"runs/",self.model), learning_rate=self.training_args["learning_rate"], per_device_train_batch_size=self.training_args["batch_size"], per_device_eval_batch_size=self.training_args["batch_size"], @@ -297,7 +297,7 @@ def compute_metrics(p: Tuple[list, list]) -> dict: trainer.evaluate() # Save the model: - trainer.save_model(self.model_path + self.model + ".model") + trainer.save_model(os.path.join(self.model_path,f"{self.model}.model")) # ------------------------------------------------------------- def create_pipeline(self) -> Pipeline: @@ -324,11 +324,11 @@ def create_pipeline(self) -> Pipeline: # If the model is local (has not been obtained from the hub), # pre-append the model path and the extension of the model # to obtain the model name. 
- if self.load_from_hub == False: - model_name = self.model_path + self.model + ".model" + if self.load_from_hub is False: + model_name = os.path.join(self.model_path, f"{self.model}.model") # Load a NER pipeline: - self.pipe = pipeline("ner", model=model_name, ignore_labels=[]) + self.pipe = pipeline("ner", model=model_name, ignore_labels=[], device=self.device) return self.pipe # ------------------------------------------------------------- diff --git a/t_res/utils/REL/__init__.py b/t_res/utils/REL/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/REL/entity_disambiguation.py b/t_res/utils/REL/entity_disambiguation.py similarity index 99% rename from utils/REL/entity_disambiguation.py rename to t_res/utils/REL/entity_disambiguation.py index 26a147de..68a1c2cf 100644 --- a/utils/REL/entity_disambiguation.py +++ b/t_res/utils/REL/entity_disambiguation.py @@ -14,11 +14,10 @@ from sklearn.linear_model import LogisticRegression from torch.autograd import Variable -sys.path.insert(0, os.path.abspath(os.path.pardir)) -import utils.REL.utils as utils -from utils import rel_utils -from utils.REL.mulrel_ranker import MulRelRanker, PreRank -from utils.REL.vocabulary import Vocabulary +from . import utils +from .. 
import rel_utils +from .mulrel_ranker import MulRelRanker, PreRank +from .vocabulary import Vocabulary RANDOM_SEED = 42 random.seed(RANDOM_SEED) @@ -68,7 +67,7 @@ def __init__(self, db_embs, user_config, reset_embeddings=False): self.config = self.__get_config(user_config) # Use CPU if cuda is not available: - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.device = self.config.get("device", "cuda" if torch.cuda.is_available() else "cpu") self.prerank_model = None self.model = None self.reset_embeddings = reset_embeddings @@ -162,7 +161,7 @@ def __get_config(self, user_config): } config = default_config - print("Model path:", config["model_path"], config["mode"]) + print("Model path:", os.path.abspath(config["model_path"]), config["mode"]) return config diff --git a/utils/REL/mulrel_ranker.py b/t_res/utils/REL/mulrel_ranker.py similarity index 100% rename from utils/REL/mulrel_ranker.py rename to t_res/utils/REL/mulrel_ranker.py diff --git a/utils/REL/utils.py b/t_res/utils/REL/utils.py similarity index 100% rename from utils/REL/utils.py rename to t_res/utils/REL/utils.py diff --git a/utils/REL/vocabulary.py b/t_res/utils/REL/vocabulary.py similarity index 100% rename from utils/REL/vocabulary.py rename to t_res/utils/REL/vocabulary.py diff --git a/t_res/utils/__init__.py b/t_res/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/deezy_processing.py b/t_res/utils/deezy_processing.py similarity index 100% rename from utils/deezy_processing.py rename to t_res/utils/deezy_processing.py diff --git a/utils/get_data.py b/t_res/utils/get_data.py similarity index 100% rename from utils/get_data.py rename to t_res/utils/get_data.py diff --git a/utils/ner.py b/t_res/utils/ner.py similarity index 100% rename from utils/ner.py rename to t_res/utils/ner.py diff --git a/utils/preprocess_data.py b/t_res/utils/preprocess_data.py similarity index 98% rename from utils/preprocess_data.py rename to 
t_res/utils/preprocess_data.py index 975df05c..d8f5c785 100644 --- a/utils/preprocess_data.py +++ b/t_res/utils/preprocess_data.py @@ -11,31 +11,30 @@ import pandas as pd -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from utils import process_wikipedia +from . import process_wikipedia -# Path to Wikipedia resources (where the wiki2wiki mapper is located): -path_to_wikipedia = "../resources/wikipedia/" - -def turn_wikipedia2wikidata(wikipedia_title: str) -> Optional[str]: +def turn_wikipedia2wikidata( + wikipedia_title: str, + wikipedia_path: str, +) -> Optional[str]: """ Convert a Wikipedia title to its corresponding Wikidata ID. Arguments: wikipedia_title (str): The title of the Wikipedia page. + wikipedia_path (str): The path to your wikipedia directory (the directory containing "index_enwiki-latest.db"). Returns: Optional[str]: The corresponding Wikidata ID if available, or None if not. Example: - >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Colosseum") + >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Colosseum", "../resources/wikipedia/") 'Q10285' - >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Ancient_Egypt") + >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Ancient_Egypt", "../resources/wikipedia/") 'Q11768' - >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Invalid_Location") + >>> turn_wikipedia2wikidata("https://en.wikipedia.org/wiki/Invalid_Location", "../resources/wikipedia/") Warning: invalid_location is not in wikipedia2wikidata, the wkdt_qid will be None. 
""" if not wikipedia_title == "NIL" and not wikipedia_title == "*": @@ -46,7 +45,7 @@ def turn_wikipedia2wikidata(wikipedia_title: str) -> Optional[str]: ) linked_wqid = process_wikipedia.title_to_id( processed_wikipedia_title, - path_to_db=os.path.join(path_to_wikipedia, "index_enwiki-latest.db"), + path_to_db=os.path.join(wikipedia_path, "index_enwiki-latest.db"), lower=True, ) if not linked_wqid: @@ -234,7 +233,7 @@ def process_lwm_for_ner(tsv_topres_path: str): def process_lwm_for_linking( - tsv_topres_path: str, gazetteer_ids: List[str] + resources_dir: str, tsv_topres_path: str, gazetteer_ids: List[str] ) -> pd.DataFrame: """ Process LwM data for performing entity linking. @@ -243,6 +242,7 @@ def process_lwm_for_linking( Each row includes the annotation and resolution information of the toponym. Arguments: + resources_dir (str): The path to the resources directory tsv_topres_path (str): The path to the top-level directory containing the annotated TSV files. gazetteer_ids (list): The set of Wikidata IDs in the gazetteer. @@ -327,8 +327,9 @@ def process_lwm_for_linking( # Clean Wikidata URL: wkpd = wkpd.replace("\\", "") + wikipedia_path = os.path.join(resources_dir, "wikipedia/") # Get Wikidata ID: - wkdt = turn_wikipedia2wikidata(wkpd) + wkdt = turn_wikipedia2wikidata(wkpd, wikipedia_path) # In mentions attached to next token through a dash, # keep only the true mention (this has to do with diff --git a/utils/process_data.py b/t_res/utils/process_data.py similarity index 99% rename from utils/process_data.py rename to t_res/utils/process_data.py index a7b80893..c8da8cd5 100644 --- a/utils/process_data.py +++ b/t_res/utils/process_data.py @@ -8,11 +8,10 @@ import pandas as pd from tqdm import tqdm -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from utils import ner +from . 
import ner if TYPE_CHECKING: - from geoparser import recogniser + from ..geoparser import recogniser def eval_with_exception(str2parse: str, in_case: Optional[Any] = "") -> Any: diff --git a/utils/process_wikipedia.py b/t_res/utils/process_wikipedia.py similarity index 100% rename from utils/process_wikipedia.py rename to t_res/utils/process_wikipedia.py diff --git a/utils/rel_e2e.py b/t_res/utils/rel_e2e.py similarity index 95% rename from utils/rel_e2e.py rename to t_res/utils/rel_e2e.py index 33dcb190..1d0beca4 100644 --- a/utils/rel_e2e.py +++ b/t_res/utils/rel_e2e.py @@ -6,11 +6,11 @@ import requests from tqdm import tqdm -# Import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from utils import process_data, process_wikipedia -from experiments import experiment +from . import process_data, process_wikipedia +# Add "../../experiments/" to path to import experiments +sys.path.insert(0, os.path.abspath("../../experiments/")) +from experiments import experiment def rel_end_to_end(sent: str) -> dict: """ @@ -57,12 +57,16 @@ def get_rel_from_api(dSentences: dict, rel_end2end_path: str) -> None: rel_preds = json.load(f) -def match_wikipedia_to_wikidata(wiki_title: str) -> str: +def match_wikipedia_to_wikidata( + wiki_title: str, + path_to_db: str, + ) -> str: """ Retrieve the Wikidata ID corresponding to a Wikipedia title. Arguments: wiki_title (str): A Wikipedia title in underscore-separated format. + path_to_db (str): The path to your wikipedia database (e.g. "../resources/wikipedia/index_enwiki-latest.db"). 
Returns: str: @@ -72,7 +76,7 @@ def match_wikipedia_to_wikidata(wiki_title: str) -> str: wqid = process_wikipedia.title_to_id( wiki_title, lower=False, - path_to_db="../resources/wikipedia/index_enwiki-latest.db", + path_to_db=path_to_db, ) if not wqid: wqid = "NIL" diff --git a/utils/rel_utils.py b/t_res/utils/rel_utils.py similarity index 98% rename from utils/rel_utils.py rename to t_res/utils/rel_utils.py index 7589924f..6cc38bee 100644 --- a/utils/rel_utils.py +++ b/t_res/utils/rel_utils.py @@ -9,8 +9,7 @@ import numpy as np import pandas as pd -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from geoparser import ranking +from ..geoparser import ranking RANDOM_SEED = 42 """Constant representing the random seed used for generating pseudo-random @@ -323,7 +322,7 @@ def prepare_rel_trainset( # Format the mentions are required by the ranker: all_mentions = [{"mention": mention} for mention in all_mentions] # Use the ranker to find candidates: - wk_cands, myranker.already_collected_cands = myranker.find_candidates(all_mentions) + wk_cands = myranker.find_candidates(all_mentions) # Rank the candidates: rel_json = rank_candidates( rel_json, diff --git a/tests/sample_files/experiments/outputs/data/lwm/linking_df_split.tsv b/tests/sample_files/experiments/outputs/data/lwm/linking_df_split.tsv new file mode 100644 index 00000000..f066ee1d --- /dev/null +++ b/tests/sample_files/experiments/outputs/data/lwm/linking_df_split.tsv @@ -0,0 +1,15 @@ +article_id sentences annotations place decade year ocr_quality_mean ocr_quality_sd publication_title publication_code place_wqid originalsplit apply withouttest Ashton1860 Dorchester1820 Dorchester1830 Dorchester1860 Manchester1780 Manchester1800 Manchester1820 Manchester1830 Manchester1860 Poole1860 +12670 [{'sentence_pos': 1, 'sentence_text': 'NOTICE. '}, {'sentence_pos': 2, 'sentence_text': 'THE STAR, Political, Naval, Military, I Literary, and Commercial Intelligencer, and General Advertiser. 
'}, {'sentence_pos': 3, 'sentence_text': 'Established in 1813.'}, {'sentence_pos': 4, 'sentence_text': '—This paper which is published every Tuesday, Thursday, and Saturday evening, contains, in addition to ectracts from the British and Foreign newspapers of articles of political and general intelligence, copious and carefully-selected information as to all naval, military, and commercial affairs, and especially that which relates to the trade and commerce of the Channel Islands. '}, {'sentence_pos': 5, 'sentence_text': 'The following are the subscription prices to The Star, payable quarterly :-For three numbers per week For two do. do. '}, {'sentence_pos': 6, 'sentence_text': 'For one do. do. 2s. do. '}, {'sentence_pos': 7, 'sentence_text': 'Single numbers, 2d. each. '}, {'sentence_pos': 8, 'sentence_text': 'Subscribers in the United Kingdom will be charged on the following scale, in British money, including pre-payment and posting, payable in advance : For three numbers per week....9s. 9d. per quarter For two do. do. ....Bs. '}, {'sentence_pos': 9, 'sentence_text': 'Bd. do. '}, {'sentence_pos': 10, 'sentence_text': 'For one do. do. ....3s. 4d. do. '}, {'sentence_pos': 11, 'sentence_text': 'The Star (or three numbers folded together) can be forwarded by post, to any part of the United Kingdom or France, on allicing a postage stamp of one PENNI: sir Address—No. 
10, Bordage-Street, Guernsey.'}] [{'mention_pos': 0, 'mention': 'Channel Islands', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Channel_Islands', 'wkdt_qid': 'Q42314', 'mention_start': 364, 'mention_end': 379, 'sent_pos': 4}, {'mention_pos': 1, 'mention': 'United Kingdom', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/United_Kingdom', 'wkdt_qid': 'Q145', 'mention_start': 19, 'mention_end': 33, 'sent_pos': 8}, {'mention_pos': 2, 'mention': 'Bordage-Street', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 177, 'mention_end': 191, 'sent_pos': 11}, {'mention_pos': 3, 'mention': 'Guernsey', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Guernsey', 'wkdt_qid': 'Q3311985', 'mention_start': 193, 'mention_end': 201, 'sent_pos': 11}] Poole 1860 1860 0.8953 0.1361 The Poole and South-Western Herald, etc. 2325 Q203349 dev train test train train train train train train train dev train test +8189322 [{'sentence_pos': 1, 'sentence_text': 'The earl of Stamford and WARRINGTON’S RENTS. '}, {'sentence_pos': 2, 'sentence_text': 'NOTICE IS HEREBY GIVEN, that an AUDIT for the Receipt of the RESERVED and CHIEF RENTS due to the Earl of Stamford and Warrington at Lady Day last for the Manor and Parish of Ashtonnnder-Lyne, in the county of Lancaster, will be holden at the Pitt and Nelson Inn, in Ashton-under-Lyne, on Monday, Tuesday, and Wednesday, the 4th, sth, and 6th days of May next, between the hours of Nine and Pour o’clock each day. '}, {'sentence_pos': 3, 'sentence_text': 'And that an AUDIT for the RESERVED and CHIEF RENTS for the Manor of Stayley, in the county of Chester, will be holden at the Eagle Inn, in Stalybridge, on Thursday, the 7th day of May next, between the hours of Eleven and Two o clock, on which days the tenants are requested to pay their rents. 
'}, {'sentence_pos': 4, 'sentence_text': 'The Court Leet and View of Frank Pledge of the said Earl, for the Manor of Ashton-under-Lyne, will be held at the Old Court House there, on Wednesday, the 6th day of May next, at Ten o’clock in the forenoon; and the Court Baron for the Manor of Stayley, on Thursday, the 7th day of May next, at One o’clock in the afternoon. '}, {'sentence_pos': 5, 'sentence_text': 'ARTHUR FREDERICK PAYNE, Agent to the said Earl. '}, {'sentence_pos': 6, 'sentence_text': 'Ashton-under-Lyne, April 23rd, 1863.'}] [{'mention_pos': 0, 'mention': 'Ashtonnnder-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 174, 'mention_end': 190, 'sent_pos': 2}, {'mention_pos': 1, 'mention': 'county of Lancaster', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Lancashire', 'wkdt_qid': 'Q23077', 'mention_start': 199, 'mention_end': 218, 'sent_pos': 2}, {'mention_pos': 2, 'mention': 'Pitt and Nelson Inn', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 242, 'mention_end': 261, 'sent_pos': 2}, {'mention_pos': 3, 'mention': 'Ashton-under-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 266, 'mention_end': 283, 'sent_pos': 2}, {'mention_pos': 4, 'mention': 'Manor of Stayley', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 59, 'mention_end': 75, 'sent_pos': 3}, {'mention_pos': 5, 'mention': 'county of Chester', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Cheshire', 'wkdt_qid': 'Q23064', 'mention_start': 84, 'mention_end': 101, 'sent_pos': 3}, {'mention_pos': 6, 'mention': 'Eagle Inn', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 125, 'mention_end': 134, 'sent_pos': 3}, {'mention_pos': 7, 'mention': 'Stalybridge', 'entity_type': 
'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 139, 'mention_end': 150, 'sent_pos': 3}, {'mention_pos': 8, 'mention': 'Ashton-under-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 75, 'mention_end': 92, 'sent_pos': 4}, {'mention_pos': 9, 'mention': 'Old Court House', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 114, 'mention_end': 129, 'sent_pos': 4}, {'mention_pos': 10, 'mention': 'Manor of Stayley', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 236, 'mention_end': 252, 'sent_pos': 4}, {'mention_pos': 11, 'mention': 'Ashton-under-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 0, 'mention_end': 17, 'sent_pos': 6}] Ashton-under-Lyne 1860 1863 0.8837 0.1619 Ashton and Stalybridge Reporter, etc. 967 Q659803 test dev left_out test dev train train train train train train train train +3938653 [{'sentence_pos': 1, 'sentence_text': 'THE COURT AND GOVERNMENT. '}, {'sentence_pos': 2, 'sentence_text': 'OSEORNE, THURSDA . '}, {'sentence_pos': 3, 'sentence_text': 'The Qaeen, accompanied by Princess Helena and Prince Albert Victor, drove oat in the grounds yesterday morning Princess Helena rode on horseback in the afternoon, attended by the Hon. '}, {'sentence_pos': 4, 'sentence_text': 'Emma. '}, {'sentence_pos': 5, 'sentence_text': 'Lascelles. '}, {'sentence_pos': 6, 'sentence_text': 'THE PRINCE AND PRINCESS OF WALES. '}, {'sentence_pos': 7, 'sentence_text': 'Plymouth, Thursday Evenhji;. '}, {'sentence_pos': 8, 'sentence_text': 'A dejeuner was given to-day at Mount Edgecumbe to about 100 of the elite of the neighbourhood, invited to meet the Prince and Princess of Wales. 
'}, {'sentence_pos': 9, 'sentence_text': 'In the afternoon their Royal Highnesses went a short yachting excursion in the Earl of Edgecunibes yacht; and in the evening the Prince dined with Viscount Templetown, Commander-inChief of the Western District, at the Government House, Mo ant Wise. '}, {'sentence_pos': 10, 'sentence_text': 'The party consisted principally of the army and navy officers in commission at the port, andthe officers •f the French squadron lying in the Sound. '}, {'sentence_pos': 11, 'sentence_text': 'After dinner the Prince adjourned with the company to a grand ball given by the united services at the Royal William Yard, Stonehouse. '}, {'sentence_pos': 12, 'sentence_text': 'Mr. '}, {'sentence_pos': 13, 'sentence_text': 'Frederick Peel ia in improved health, and has been gradually getting better since Tuesday. '}, {'sentence_pos': 14, 'sentence_text': 'The right hon. gentleman is still confined to his chamber.'}] [{'mention_pos': 0, 'mention': 'OSEORNE', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 0, 'mention_end': 7, 'sent_pos': 2}, {'mention_pos': 1, 'mention': 'Plymouth', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Plymouth', 'wkdt_qid': 'Q43382', 'mention_start': 0, 'mention_end': 8, 'sent_pos': 7}, {'mention_pos': 2, 'mention': 'Mount Edgecumbe', 'entity_type': 'BUILDING', 'wkpd_url': 'https://en.wikipedia.org/wiki/Mount_Edgcumbe_House', 'wkdt_qid': 'Q6920546', 'mention_start': 31, 'mention_end': 46, 'sent_pos': 8}, {'mention_pos': 3, 'mention': 'Government House', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 218, 'mention_end': 234, 'sent_pos': 9}, {'mention_pos': 4, 'mention': 'Mo ant Wise', 'entity_type': 'BUILDING', 'wkpd_url': 'https://en.wikipedia.org/wiki/Mount_Wise,_Plymouth', 'wkdt_qid': 'Q14912878', 'mention_start': 236, 'mention_end': 247, 'sent_pos': 9}, {'mention_pos': 5, 'mention': 'Sound', 'entity_type': 'LOC', 'wkpd_url': 
'https://en.wikipedia.org/wiki/Plymouth_Sound', 'wkdt_qid': 'Q2333061', 'mention_start': 141, 'mention_end': 146, 'sent_pos': 10}, {'mention_pos': 6, 'mention': 'Royal William Yard', 'entity_type': 'BUILDING', 'wkpd_url': 'https://en.wikipedia.org/wiki/Royal_William_Victualling_Yard', 'wkdt_qid': 'Q7375014', 'mention_start': 103, 'mention_end': 121, 'sent_pos': 11}, {'mention_pos': 7, 'mention': 'Stonehouse', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stonehouse,_Plymouth', 'wkdt_qid': 'Q7619235', 'mention_start': 123, 'mention_end': 133, 'sent_pos': 11}] Manchester 1860 1865 0.821 0.2563 Manchester Courier and Lancashire General Advertiser. 206 Q18125 train train dev train train train train train train train train test train +4938614 [{'sentence_pos': 1, 'sentence_text': 'DUKINFIELD. '}, {'sentence_pos': 2, 'sentence_text': 'Knutsford Sessions.'}, {'sentence_pos': 3, 'sentence_text': '—The servant girl, Eliza Ann Byrom, who stole a quantity of clothes from the house where she lodged, in Dukiafield, was sentenced to two months’ imprisonment. '}, {'sentence_pos': 4, 'sentence_text': 'Martha Wilde, who was sent from the Dukinfield court for obtaining money under false pretences by representing at two pawnshops in Dukinfield that a spurious composition called coraline beads were real coral was discharged. '}, {'sentence_pos': 5, 'sentence_text': 'Mr. '}, {'sentence_pos': 6, 'sentence_text': 'Brandt appeared for the prisoner. '}, {'sentence_pos': 7, 'sentence_text': 'Accident in a Factory.'}, {'sentence_pos': 8, 'sentence_text': '—Whilst a boy named Edwin _ Diggle, 14 years of age, a pieoer at Mr. Chadwick’s factory'}, {'sentence_pos': 9, 'sentence_text': '. 
'}, {'sentence_pos': 10, 'sentence_text': 'Tame Valley, was engaged, on Wednesday, in cleaning some jennies in one of the rooms he met with a severe accident to his head._ The spinner, who is rather deaf, not knowing that the lad was amongst the machinery, set the jennies going, and being unable to hear the shouts of the lad on account of the defect in his hearing, did not stop them until another man made signs to him to stop, which he did instantly. '}, {'sentence_pos': 11, 'sentence_text': 'The poor lad had, however, been severely hurt on the head, although no limbs were injured. '}, {'sentence_pos': 12, 'sentence_text': 'He was immediately conveyed in a cab to the Infirmary, where we understand he is progressing favourably.'}] [{'mention_pos': 0, 'mention': 'DUKINFIELD', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Dukinfield', 'wkdt_qid': 'Q1976179', 'mention_start': 0, 'mention_end': 10, 'sent_pos': 1}, {'mention_pos': 1, 'mention': 'Knutsford', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Knutsford', 'wkdt_qid': 'Q1470791', 'mention_start': 0, 'mention_end': 9, 'sent_pos': 2}, {'mention_pos': 2, 'mention': 'Dukiafield', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Dukinfield', 'wkdt_qid': 'Q1976179', 'mention_start': 104, 'mention_end': 114, 'sent_pos': 3}, {'mention_pos': 3, 'mention': 'Dukinfield', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Dukinfield', 'wkdt_qid': 'Q1976179', 'mention_start': 36, 'mention_end': 46, 'sent_pos': 4}, {'mention_pos': 4, 'mention': 'Dukinfield', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Dukinfield', 'wkdt_qid': 'Q1976179', 'mention_start': 131, 'mention_end': 141, 'sent_pos': 4}, {'mention_pos': 5, 'mention': 'Mr. Chadwick’s factory', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 65, 'mention_end': 87, 'sent_pos': 8}] Ashton-under-Lyne 1860 1869 0.9047 0.1623 The Ashton Reporter. 
968 Q659803 train train dev test dev train train train train train train train train +8189322 [{'sentence_pos': 1, 'sentence_text': 'The earl of Stamford and WARRINGTON’S RENTS. '}, {'sentence_pos': 2, 'sentence_text': 'NOTICE IS HEREBY GIVEN, that an AUDIT for the Receipt of the RESERVED and CHIEF RENTS due to the Earl of Stamford and Warrington at Lady Day last for the Manor and Parish of Ashtonnnder-Lyne, in the county of Lancaster, will be holden at the Pitt and Nelson Inn, in Ashton-under-Lyne, on Monday, Tuesday, and Wednesday, the 4th, sth, and 6th days of May next, between the hours of Nine and Pour o’clock each day. '}, {'sentence_pos': 3, 'sentence_text': 'And that an AUDIT for the RESERVED and CHIEF RENTS for the Manor of Stayley, in the county of Chester, will be holden at the Eagle Inn, in Stalybridge, on Thursday, the 7th day of May next, between the hours of Eleven and Two o clock, on which days the tenants are requested to pay their rents. '}, {'sentence_pos': 4, 'sentence_text': 'The Court Leet and View of Frank Pledge of the said Earl, for the Manor of Ashton-under-Lyne, will be held at the Old Court House there, on Wednesday, the 6th day of May next, at Ten o’clock in the forenoon; and the Court Baron for the Manor of Stayley, on Thursday, the 7th day of May next, at One o’clock in the afternoon. '}, {'sentence_pos': 5, 'sentence_text': 'ARTHUR FREDERICK PAYNE, Agent to the said Earl. 
'}, {'sentence_pos': 6, 'sentence_text': 'Ashton-under-Lyne, April 23rd, 1863.'}] [{'mention_pos': 0, 'mention': 'Ashtonnnder-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 174, 'mention_end': 190, 'sent_pos': 2}, {'mention_pos': 1, 'mention': 'county of Lancaster', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Lancashire', 'wkdt_qid': 'Q23077', 'mention_start': 199, 'mention_end': 218, 'sent_pos': 2}, {'mention_pos': 2, 'mention': 'Pitt and Nelson Inn', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 242, 'mention_end': 261, 'sent_pos': 2}, {'mention_pos': 3, 'mention': 'Ashton-under-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 266, 'mention_end': 283, 'sent_pos': 2}, {'mention_pos': 4, 'mention': 'Manor of Stayley', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 59, 'mention_end': 75, 'sent_pos': 3}, {'mention_pos': 5, 'mention': 'county of Chester', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Cheshire', 'wkdt_qid': 'Q23064', 'mention_start': 84, 'mention_end': 101, 'sent_pos': 3}, {'mention_pos': 6, 'mention': 'Eagle Inn', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 125, 'mention_end': 134, 'sent_pos': 3}, {'mention_pos': 7, 'mention': 'Stalybridge', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 139, 'mention_end': 150, 'sent_pos': 3}, {'mention_pos': 8, 'mention': 'Ashton-under-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 75, 'mention_end': 92, 'sent_pos': 4}, {'mention_pos': 9, 'mention': 'Old Court House', 'entity_type': 'BUILDING', 'wkpd_url': '*', 
'wkdt_qid': 'NIL', 'mention_start': 114, 'mention_end': 129, 'sent_pos': 4}, {'mention_pos': 10, 'mention': 'Manor of Stayley', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 236, 'mention_end': 252, 'sent_pos': 4}, {'mention_pos': 11, 'mention': 'Ashton-under-Lyne', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 0, 'mention_end': 17, 'sent_pos': 6}] Ashton-under-Lyne 1860 1863 0.8837 0.1619 Ashton and Stalybridge Reporter, etc. 967 Q659803 test dev left_out test dev train train train train train train train train +4939308 [{'sentence_pos': 1, 'sentence_text': 'THE STALYBRIDGE “EMPORIUM'}, {'sentence_pos': 2, 'sentence_text': '.'}, {'sentence_pos': 3, 'sentence_text': '” * 4 During the last year two shops have been erected in Melbourne-street by Mr. '}, {'sentence_pos': 4, 'sentence_text': 'Napoleon Ives, who give them the above name. '}, {'sentence_pos': 5, 'sentence_text': 'On Thursday evening the workmen employed thereon, and others, to the number of forty, sat dowu to a spread of geese, turkey, and other good things, at the house of Mr. '}, {'sentence_pos': 6, 'sentence_text': 'Bray, Dog and Partridge, Market-street. '}, {'sentence_pos': 7, 'sentence_text': 'Afterwards Mr. '}, {'sentence_pos': 8, 'sentence_text': 'Napoleon Ives, who paid for the dinner, occupied the chair, and Mr. '}, {'sentence_pos': 9, 'sentence_text': 'Haigh France the vice-chair. '}, {'sentence_pos': 10, 'sentence_text': 'The Chairman, in his opening remarks, said they had assembled in accordance with a good old custom, which brought together all who had been engaged in the erection of buildings. '}, {'sentence_pos': 11, 'sentence_text': 'It was one which had been almost extinguished, but he had always thought if ever it should be his fortune to erect any building, he would give a supper to the workmen engaged in its erection. 
'}, {'sentence_pos': 12, 'sentence_text': 'After complimenting the workmen on having done their duty to the two shops, he alluded to the dangers connected with the building trade from the excavator to the slater, and said that no buildings in town surpassed those which had led to that night’s gathering in drainage and other sanitory arrangements. '}, {'sentence_pos': 13, 'sentence_text': 'Every aperture was properly trapped, an important considei ation, seeing that the medical men of all large towns declared that the great causes of disease were impure water and unwholesome stenches. '}, {'sentence_pos': 14, 'sentence_text': 'Another important matter ip connection with houses was baths, without which no home could be considered complete. '}, {'sentence_pos': 15, 'sentence_text': 'He had, therefore, erected baths in each of the shops, which could be supplied with either hot or cold water. '}, {'sentence_pos': 16, 'sentence_text': 'In conclusion, he proposed the toast of “ The Queen,” and said he hoped she might live long to reign over them, and that her future might be as bright if not more brilliant than the past. '}, {'sentence_pos': 17, 'sentence_text': 'The toast was received with musical honours. '}, {'sentence_pos': 18, 'sentence_text': 'Mr. '}, {'sentence_pos': 19, 'sentence_text': 'Joseph Turner proposed “The Prince and Princess of Wales, and the rest of the royal family,” after which Mr. '}, {'sentence_pos': 20, 'sentence_text': 'Chabnock gave, with an appropriate address, “The Army, Navy, and Volunteers.” '}, {'sentence_pos': 21, 'sentence_text': 'He said Englishmen were proud of their army and navy, and often referred to their deeds in the most patriotic language. '}, {'sentence_pos': 22, 'sentence_text': 'He felt sure that if the services of the volunteers should be required, they would worthily emulate the deeds of old. 
'}, {'sentence_pos': 23, 'sentence_text': 'In conclusion, he gave a composition of his own, entitled “Defence, not defiance.” '}, {'sentence_pos': 24, 'sentence_text': 'The toast was well received, the Chairman speaking very highly of the volunteers of the country, and pointing out that, if properly managed, they would very much tend to a decrease in the annual military and naval expenditure of the country. '}, {'sentence_pos': 25, 'sentence_text': 'Mr, Henbt Jeffreys proposed “Prosperity to the town and trade of Stalybridge, with a few appropriate remarks. '}, {'sentence_pos': 26, 'sentence_text': 'After it had been duly drunk, the Chairman responded. '}, {'sentence_pos': 27, 'sentence_text': 'He referred to bad trade in general, and to that ef the bmild*'}] [{'mention_pos': 0, 'mention': 'STALYBRIDGE', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 4, 'mention_end': 15, 'sent_pos': 1}, {'mention_pos': 1, 'mention': 'Melbourne-street', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 59, 'mention_end': 75, 'sent_pos': 3}, {'mention_pos': 2, 'mention': 'Dog and Partridge', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 6, 'mention_end': 23, 'sent_pos': 6}, {'mention_pos': 3, 'mention': 'Market-street', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 25, 'mention_end': 38, 'sent_pos': 6}, {'mention_pos': 4, 'mention': 'Stalybridge', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Stalybridge', 'wkdt_qid': 'Q1398653', 'mention_start': 65, 'mention_end': 76, 'sent_pos': 25}] Ashton-under-Lyne 1860 1869 0.89 0.1803 The Ashton Reporter. 968 Q659803 train train train test dev train train train train train train train train +8199709 [{'sentence_pos': 1, 'sentence_text': 'NOTICE.'}, {'sentence_pos': 2, 'sentence_text': '—REGULAR WEEKLY ATTENDANCE, TEETH. 
'}, {'sentence_pos': 3, 'sentence_text': 'TEETH. '}, {'sentence_pos': 4, 'sentence_text': 'Messrs molloy,surgeon dentists, of 112, Rusholme-road (near All Saints), Manchester, rm-y be consulted every Saturday, from Ten till Five o’clock, at 931 Stamford-slreet, Ashton, tee residence of Mr. '}, {'sentence_pos': 5, 'sentence_text': 'Bostock, Chemist. '}, {'sentence_pos': 6, 'sentence_text': 'They continue to suppy their unrivalled MINERAL TEETH and ARTIFICIAL GUMS, which restore both the appearance of natural teeth and their usefulness in mastication. '}, {'sentence_pos': 7, 'sentence_text': 'Their method of fixing them defies detection and dispenses with all painful operations. '}, {'sentence_pos': 8, 'sentence_text': 'Mersrs. '}, {'sentence_pos': 9, 'sentence_text': 'Molloy’s method of applying the PATENT GUMCOLOURED VULCANITE restores the deficiency of the gums and teeth, the natural appearance of the featu-es, and is particularly applicable in those cases in which from old age or long loss of the teeth the gums have become sarnnk or wasted. '}, {'sentence_pos': 10, 'sentence_text': 'By theuseof this perfectly incorrodible, light, and flexible material, from one to a com pie e set of aitifioial teeth may be worn with the greatest comfort, perfo-ming all the functions of natu-ai teeth without causing the slightest pain or inconvenience, and can be removed and replaced with the greatest ease. '}, {'sentence_pos': 11, 'sentence_text': 'Tneir WHITE ENAMEL restores black and decayed teeth to tneir original whiteness, prevents toothache, and makes a hollow tooth sound and useful for many years. '}, {'sentence_pos': 12, 'sentence_text': 'Terms—A single teoth from ss. 
'}, {'sentence_pos': 13, 'sentence_text': 'Attendance every Saturday at 231, Stamford-street, Ashton'}] [{'mention_pos': 0, 'mention': 'Rusholme-road', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 40, 'mention_end': 53, 'sent_pos': 4}, {'mention_pos': 1, 'mention': 'All Saints', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 60, 'mention_end': 70, 'sent_pos': 4}, {'mention_pos': 2, 'mention': 'Manchester', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Manchester', 'wkdt_qid': 'Q18125', 'mention_start': 73, 'mention_end': 83, 'sent_pos': 4}, {'mention_pos': 3, 'mention': 'Stamford-slreet', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 154, 'mention_end': 169, 'sent_pos': 4}, {'mention_pos': 4, 'mention': 'Ashton', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 171, 'mention_end': 177, 'sent_pos': 4}, {'mention_pos': 5, 'mention': 'Stamford-street', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 34, 'mention_end': 49, 'sent_pos': 13}, {'mention_pos': 6, 'mention': 'Ashton', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ashton-under-Lyne', 'wkdt_qid': 'Q659803', 'mention_start': 51, 'mention_end': 57, 'sent_pos': 13}] Ashton-under-Lyne 1860 1862 0.8747 0.1962 Ashton and Stalybridge Reporter, etc. 967 Q659803 train train train test dev train train train train train train train train +3580760 "[{'sentence_pos': 1, 'sentence_text': 'Postscript. '}, {'sentence_pos': 2, 'sentence_text': 'LONDON, THURSDAY, SEPTEMBER 27. '}, {'sentence_pos': 3, 'sentence_text': 'City, Twelve oClock.'}, {'sentence_pos': 4, 'sentence_text': '—The consol market opened this morning at 84A lor the account, but has since been heavy, fluctuating between 83| and 84, and has now declined to 83£ for money, and 83$ J for the account. 
'}, {'sentence_pos': 5, 'sentence_text': 'Four oJClock—Consols for Account, 83J. '}, {'sentence_pos': 6, 'sentence_text': 'By the Romona, steam boat, we have received accounts from Oporto to the 21st; and from the fleet, which was in latitude 37. 21., longitude 11. 37., to the 18th instant. '}, {'sentence_pos': 7, 'sentence_text': 'The two fleets remained in sight of each other, and it was expected would come to an engagement as soon as Sartorius had been joined by the vessels which he was then expecting at Oporto. '}, {'sentence_pos': 8, 'sentence_text': 'Affairs remained in nearly the same state as when the last accounts left. '}, {'sentence_pos': 9, 'sentence_text': 'He have seen a letter, of which the following is an extract, from Lieutenant-Colonel Hodges:—""On the 16th instant we had a brilliant affair. '}, {'sentence_pos': 10, 'sentence_text': 'We drove the Miguelites betore us from all parts. '}, {'sentence_pos': 11, 'sentence_text': 'My little British band crowned themselves with glory. '}, {'sentence_pos': 12, 'sentence_text': 'Never was there any thing more brave than their repulsing the enemy from the heights . they actually fled before us, and lost one field officer and 50 men. '}, {'sentence_pos': 13, 'sentence_text': 'Santa Martha commanded in person against my part of the line, and had three regiments of the line, one of volunteers, and one of militia, against 200 British and 150 Portuguese. 1 lost one officer, Colonel Staunton, one officer wounded, two men killed, and 17 wounded."" '}, {'sentence_pos': 14, 'sentence_text': 'Frankfort Fair—Sept. 14—The first week of our corn fair this year has not otTered any satisfactory result, and we have to add thatour autumn crop must be reckoned among the worst that we have yet had. '}, {'sentence_pos': 15, 'sentence_text': 'No important transactions have taken place as yet in any article. 
'}, {'sentence_pos': 16, 'sentence_text': 'We no longer observe that spirit of activity which used to reign in our streets, especially at harvest time. '}, {'sentence_pos': 17, 'sentence_text': 'The inns alone have benefited by the presence of foreign travellers, and of those who have come to our town through business. '}, {'sentence_pos': 18, 'sentence_text': 'Notwithstanding the sad state of our trade in this circumstance, it cannot be said to be more wretched than that of Offenbach, only as our merchants transport their goods from that town, when they effect the sale personally, the principal mass of money entering, consequently, into their coffers, gave a little more animation to the fair. '}, {'sentence_pos': 19, 'sentence_text': 'Smuggling is going on actively; it is even said that there are companies who insure that kind of speculation. '}, {'sentence_pos': 20, 'sentence_text': 'The value of houses and other fixtures decreases considerably in our town, while it rises at Offenbach. '}, {'sentence_pos': 21, 'sentence_text': 'The number of poor increases daily, as well on account of the bad harvest as of the exorbitant taxes which they are obliged to pay to the customs for the introduction of their merchandize. 
'}, {'sentence_pos': 22, 'sentence_text': 'Their progressive increase has engaged our Senate to take measures for the improvement of the asylums which are destined for them, for the keeping up of which the subscriptions of the philanthropic citizens are no Longer sufficient Suabian Mercury,'}]" [{'mention_pos': 0, 'mention': 'LONDON', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/London', 'wkdt_qid': 'Q84', 'mention_start': 0, 'mention_end': 6, 'sent_pos': 2}, {'mention_pos': 1, 'mention': 'City', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/City_of_London', 'wkdt_qid': 'Q23311', 'mention_start': 0, 'mention_end': 4, 'sent_pos': 3}, {'mention_pos': 2, 'mention': 'Oporto', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Porto', 'wkdt_qid': 'Q36433', 'mention_start': 58, 'mention_end': 64, 'sent_pos': 6}, {'mention_pos': 3, 'mention': 'Oporto', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Porto', 'wkdt_qid': 'Q36433', 'mention_start': 179, 'mention_end': 185, 'sent_pos': 7}, {'mention_pos': 4, 'mention': 'Frankfort', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Frankfurt', 'wkdt_qid': 'Q1794', 'mention_start': 0, 'mention_end': 9, 'sent_pos': 14}, {'mention_pos': 5, 'mention': 'Offenbach', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Offenbach_am_Main', 'wkdt_qid': 'Q3042', 'mention_start': 116, 'mention_end': 125, 'sent_pos': 18}, {'mention_pos': 6, 'mention': 'Offenbach', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Offenbach_am_Main', 'wkdt_qid': 'Q3042', 'mention_start': 93, 'mention_end': 102, 'sent_pos': 20}] Manchester 1830 1832 0.8929 0.1848 Manchester Courier and Lancashire General Advertiser. 206 Q18125 train train train train train train train train train train test train train +8263166 "[{'sentence_pos': 1, 'sentence_text': 'MONTHLY AGRICULTURAL REPORT. 
'}, {'sentence_pos': 2, 'sentence_text': 'The mild open weather through the month has revived those plants ot Wheat, uhieh on very light soils appeared much injured by the severity of the late frost. llieir general appearance is promising. '}, {'sentence_pos': 3, 'sentence_text': 'The markets for all kindsof Com continue depressed. '}, {'sentence_pos': 4, 'sentence_text': 'The deficiency of Turnips is severely felt in Norfolk, and odier counties, where thev depend so much upon them to top their spring beasts. 1 n the Northern districts they are generally good Coleseed for a crop is doing well. '}, {'sentence_pos': 5, 'sentence_text': 'Winter Tares and Rye. for early feed, in most parts look kindly. '}, {'sentence_pos': 6, 'sentence_text': 'Clover seed is expected to turn out a fair average crop. '}, {'sentence_pos': 7, 'sentence_text': 'On dry roils Beaus are getting in, and some early Peas alxi; bu; the heavy rains which fell in die mhidle of the mouth, will retard tlieir souring on strong lands— I/can 1 leasts are scarce, and somewhat dearer; lx it Store Sheep continue low priced. '}, {'sentence_pos': 8, 'sentence_text': 'South field for the last two or three weeks has been but thinly supplied with prime Beef, but with plenty of good"" Mutton ; House Lamb is rather scarce, aud at present dear. '}, {'sentence_pos': 9, 'sentence_text': 'Pork is reasonable. 
'}, {'sentence_pos': 10, 'sentence_text': 'Hops continue low and dull in silo, and the Wool market is rather more brisk for both long and short fleeces; but with little variation from the last mouths prices.'}]" [{'mention_pos': 0, 'mention': 'Norfolk', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Norfolk', 'wkdt_qid': 'Q23109', 'mention_start': 46, 'mention_end': 53, 'sent_pos': 4}] Manchester 1820 1821 0.8485 0.2334 The Manchester Mercury and Harrops General Advertiser 239 Q18125 train train train train train train train train train test train train dev +10734579 [{'sentence_pos': 1, 'sentence_text': 'NOTICE.'}, {'sentence_pos': 2, 'sentence_text': '—How delightful to see a Lady or Gentleman’s beautiful black shining Hoot or Shoe reflecting every surrounding object in TURNER’S UNRIVALLED BLACKING. '}, {'sentence_pos': 3, 'sentence_text': 'This invaluable composition may behad at the Dorset County Chronicle Office, and of all the respectable shop, keepers in Dorchester and its vicinity, in stone bottles, at 6u.t Is., and 18d. each. '}, {'sentence_pos': 4, 'sentence_text': 'A CH A t.I.r.NG R. 1 challenge the world to produce, if it can, A Blacking that’s equal to Turner’s Japan : For like crystal it shines, while it softens the leather. '}, {'sentence_pos': 5, 'sentence_text': 'And makes your boots proof gainst the wind and.the weather. '}, {'sentence_pos': 6, 'sentence_text': 'Why the praises of Turners Japan should I tell. '}, {'sentence_pos': 7, 'sentence_text': 'When those who have used it must know it so well ? '}, {'sentence_pos': 8, 'sentence_text': 'Why should coachmen still brush at old harness in vain, When this blacking can give it fresh lustre again! '}, {'sentence_pos': 9, 'sentence_text': 'For, without second sight, I can prophecy soon, That you will see it adopted by Foot and Dragoon ; For you scarcely can tell, when drawn up on parade, Which glitters the brightest, the boot or the blade. 
'}, {'sentence_pos': 10, 'sentence_text': 'The Gentlemen too, who would boast their attire. '}, {'sentence_pos': 11, 'sentence_text': 'And feel for respect so profound a desire. '}, {'sentence_pos': 12, 'sentence_text': 'May hear the fair Ladies, if Turners they uae, Cry, How charmingly polished his manners and shoes* Vc Authors and Poets who gladly engage To reform and instruct this degenerate age. '}, {'sentence_pos': 13, 'sentence_text': 'Use Turners Japan and your fame will take root. '}, {'sentence_pos': 14, 'sentence_text': 'Then your genius will shine as well as your boot. 1 hough Davy and Home have astonished the world. '}, {'sentence_pos': 15, 'sentence_text': 'And the chemical volume of nature unfurl’d, Yet in chemistry surely there something was lacking, TUI Turner discover’d his wonderful Blacking. '}, {'sentence_pos': 16, 'sentence_text': 'Who do not feel pride in a Wellingtons name. '}, {'sentence_pos': 17, 'sentence_text': 'When the whole of the universe rings with his fame 1 So arc Turner and Wellinoion famous afar, One the hero of Blacking, the other of War!! '}, {'sentence_pos': 18, 'sentence_text': 'Gentlemen may observe that this Composition, when wed for their Gig and Carriage Harness, after one or two application* will produce a brilliant, rich, glossy black lustre, and it the same time act as a preserver of the leather, RICHARD IX RNER.'}] [{'mention_pos': 0, 'mention': 'Dorset', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Dorset', 'wkdt_qid': 'Q23159', 'mention_start': 45, 'mention_end': 51, 'sent_pos': 3}, {'mention_pos': 1, 'mention': 'Dorchester', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Dorchester,_Dorset', 'wkdt_qid': 'Q503331', 'mention_start': 121, 'mention_end': 131, 'sent_pos': 3}] Dorchester 1820 1825 0.8665 0.221 Dorset County Chronicle, etc. 
408 Q503331 train train train train test train train train train dev train dev train +3896074 [{'sentence_pos': 1, 'sentence_text': 'A REMONSTRANCE. '}, {'sentence_pos': 2, 'sentence_text': 'What bard art thou so apt to grace. '}, {'sentence_pos': 3, 'sentence_text': 'With poetrys pure breath, The icatterer of the human race ! '}, {'sentence_pos': 4, 'sentence_text': 'The trafficker in death ! '}, {'sentence_pos': 5, 'sentence_text': 'Are plunderd shrine, and midnight chain— A shrieking citys blaze— And beauty, in polution slain, The themes for poets praise ? '}, {'sentence_pos': 6, 'sentence_text': 'Marengos hero then recall! '}, {'sentence_pos': 7, 'sentence_text': 'Dethrone the crowned of earth ! '}, {'sentence_pos': 8, 'sentence_text': 'Shake oer a weeping world deaths pallCheer Horrors brood to birth ! '}, {'sentence_pos': 9, 'sentence_text': 'While startled bones of millions rise All ghastly as they stood When thunderous battle mock d the skies, And lained down human blood! '}, {'sentence_pos': 10, 'sentence_text': 'Whilst hoa y heads, all stark and gashed, Throng shore and town remote, As when the Gallic armies plashed In carnage to the throat! '}, {'sentence_pos': 11, 'sentence_text': 'His meteor-sceptre pledge once more; Napoleon to the van I Come, quaff the reeking cup of gore, And shout for slaughter !'}, {'sentence_pos': 12, 'sentence_text': '—Man. '}, {'sentence_pos': 13, 'sentence_text': 'Another Austerlitz demand, Another Jena claim; And desolate the groaning lan 1 To write one despots fame ! '}, {'sentence_pos': 14, 'sentence_text': 'Let cen the pyramids afford A verse for his renown : But speak not of brave Sidneys sword, That swept the invader down I Sing not of Moscows flaming tide— The fiery brands which hurled The chainer of the nations wide, The scourfer of the world. '}, {'sentence_pos': 15, 'sentence_text': 'With rout and havoc from their shore ! 
'}, {'sentence_pos': 16, 'sentence_text': 'To Cossack hate consigned :— Famine and frozen wastes before, Bones and the wolf behind. '}, {'sentence_pos': 17, 'sentence_text': 'Let net Trafalgars chief, who died, A moments thought beguile; Nor laud with British heart and pride The Baltic and the Nile! '}, {'sentence_pos': 18, 'sentence_text': 'Leave Nelsons glorious flag unsung, And Wellington unwreathed; Their fame with which all Europe rung, For his— whod best neer breathed ! '}, {'sentence_pos': 19, 'sentence_text': 'And gloze the tyrants guilty mood, And wail his hapless cause: That Sylla—in his thirst for blood ! '}, {'sentence_pos': 20, 'sentence_text': 'That Draco—in his laws ! '}, {'sentence_pos': 21, 'sentence_text': 'Mourn him who to the conscript gave HeHrt-broken France a prey; And sill could outraged nature brave With homicidal sway ! '}, {'sentence_pos': 22, 'sentence_text': 'Yes, wake the throb of sympathyBid maudlin tears reveal How much men grieve for Europe freeHow miss the tyrants steel! '}, {'sentence_pos': 23, 'sentence_text': 'And make the conscious sea blush gore In shame for Nelsons land ; Make earth, revolting, lift once more Her blood-accusing hand ! 
'}, {'sentence_pos': 24, 'sentence_text': 'CHARLES SWAIN.'}] [{'mention_pos': 0, 'mention': 'Austerlitz', 'entity_type': 'OTHER', 'wkpd_url': 'https://en.wikipedia.org/wiki/Battle_of_Austerlitz', 'wkdt_qid': 'Q134114', 'mention_start': 8, 'mention_end': 18, 'sent_pos': 13}, {'mention_pos': 1, 'mention': 'Jena', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Jena', 'wkdt_qid': 'Q3150', 'mention_start': 35, 'mention_end': 39, 'sent_pos': 13}, {'mention_pos': 2, 'mention': 'Moscows', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Moscow', 'wkdt_qid': 'Q649', 'mention_start': 133, 'mention_end': 140, 'sent_pos': 14}, {'mention_pos': 3, 'mention': 'Trafalgars', 'entity_type': 'OTHER', 'wkpd_url': 'https://en.wikipedia.org/wiki/Battle_of_Trafalgar', 'wkdt_qid': 'Q171416', 'mention_start': 8, 'mention_end': 18, 'sent_pos': 17}, {'mention_pos': 4, 'mention': 'The Baltic', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Baltic_Sea', 'wkdt_qid': 'Q545', 'mention_start': 101, 'mention_end': 111, 'sent_pos': 17}, {'mention_pos': 5, 'mention': 'Nile', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Nile', 'wkdt_qid': 'Q3392', 'mention_start': 120, 'mention_end': 124, 'sent_pos': 17}, {'mention_pos': 6, 'mention': 'Europe', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Europe', 'wkdt_qid': 'Q46', 'mention_start': 89, 'mention_end': 95, 'sent_pos': 18}, {'mention_pos': 7, 'mention': 'France', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/France', 'wkdt_qid': 'Q142', 'mention_start': 49, 'mention_end': 55, 'sent_pos': 21}, {'mention_pos': 8, 'mention': 'Europe', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Europe', 'wkdt_qid': 'Q46', 'mention_start': 80, 'mention_end': 86, 'sent_pos': 22}] Manchester 1830 1839 0.883 0.1934 Manchester Courier and Lancashire General Advertiser. 
206 Q18125 train train train train train train train train train train test train train +3691199 [{'sentence_pos': 1, 'sentence_text': 'CHURCHES AND CHAPELS. '}, {'sentence_pos': 2, 'sentence_text': 'The incorporated society for promoting the enlargement, building, and repairing of churches and chapels, held their nineteenth annual general committee on Friday week, at their chambers in St. Martins-place'}, {'sentence_pos': 3, 'sentence_text': '; the Archbishop of Canterbury in the chair. '}, {'sentence_pos': 4, 'sentence_text': 'The Bishop of Durham, the Bishop of Winchester, the Bishop of Gloucester and Bristol, the Bishop of Hereford, the Bishop of Bangor, the Dean of Norwich, the Rev. '}, {'sentence_pos': 5, 'sentence_text': 'Sir Henry Dukenfield, Bart., Lord Kenyon, and many other distinguished clergymen and laymen, were also present. '}, {'sentence_pos': 6, 'sentence_text': 'The report of the committee stated that they felt great satisfaction and thankfulness towards the Giver of all Good in being able to announce that, during the past year, the society had exerted itself with undiminished activity and success. '}, {'sentence_pos': 7, 'sentence_text': 'Their exhausted funds had been replenished to a degree even beyond their anticipations by the collections made throughout the country under the authority of the kings letter. '}, {'sentence_pos': 8, 'sentence_text': 'All the returns had not yet been received, but those which had, had advised them of contributions to the amount of £34,000, which exceeded the collection under the same authority in 1834, by £3,000, and which was still more gratifying, as, during the last year, £116,000 had been subscribed for the erection of additional churches in the metropolis. '}, {'sentence_pos': 9, 'sentence_text': 'During the last year 188 applications had been made, and the grants had been 108, both exceeding those of any former year, the latter by 35. 
'}, {'sentence_pos': 10, 'sentence_text': 'The money granted had been £21,872, being £700 above any former year. '}, {'sentence_pos': 11, 'sentence_text': 'The sittings granted had been 41,710, being 7,000 above any precedent; and of those 28,872 were free and unappropriated. '}, {'sentence_pos': 12, 'sentence_text': 'The grants of last year were deemed particularly important, as while the society had contributed towards the increase of accommodation in many of the present churches they had contributed to the erection of forty-one additional churches and chapels, and the rebuilding of sixteen others. '}, {'sentence_pos': 13, 'sentence_text': 'Contributions had been received from diocesan associations in Bath and Wells, Winchester, Exeter, and Cleveland, which were gratifying proofs of the increasing desire to relieve the spiritual wants of the people of the Church of England. '}, {'sentence_pos': 14, 'sentence_text': 'AmjtJgst their contributions was the munificent sum of 18,000 from Mr. '}, {'sentence_pos': 15, 'sentence_text': 'George Davenport, of Lime-street, London. '}, {'sentence_pos': 16, 'sentence_text': 'Since their institution in 1818 the society had expended £220,731, thus insuring 354,925 additional sittings, of which 262,366 were free and unappropriated. '}, {'sentence_pos': 17, 'sentence_text': 'The society, therefore, looked with joy to the past, and to the future with the confident hope that they might go on and prosper.'}] [{'mention_pos': 0, 'mention': 'St. 
Martins-place', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 189, 'mention_end': 206, 'sent_pos': 2}, {'mention_pos': 1, 'mention': 'Bath', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Bath,_Somerset', 'wkdt_qid': 'Q22889', 'mention_start': 62, 'mention_end': 66, 'sent_pos': 13}, {'mention_pos': 2, 'mention': 'Wells', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Wells,_Somerset', 'wkdt_qid': 'Q212283', 'mention_start': 71, 'mention_end': 76, 'sent_pos': 13}, {'mention_pos': 3, 'mention': 'Winchester', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Winchester', 'wkdt_qid': 'Q172157', 'mention_start': 78, 'mention_end': 88, 'sent_pos': 13}, {'mention_pos': 4, 'mention': 'Exeter', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Exeter', 'wkdt_qid': 'Q134672', 'mention_start': 90, 'mention_end': 96, 'sent_pos': 13}, {'mention_pos': 5, 'mention': 'Cleveland', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Cleveland,_England', 'wkdt_qid': 'Q24651706', 'mention_start': 102, 'mention_end': 111, 'sent_pos': 13}, {'mention_pos': 6, 'mention': 'England', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/England', 'wkdt_qid': 'Q21', 'mention_start': 229, 'mention_end': 236, 'sent_pos': 13}, {'mention_pos': 7, 'mention': 'Lime-street', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 21, 'mention_end': 32, 'sent_pos': 15}, {'mention_pos': 8, 'mention': 'London', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/London', 'wkdt_qid': 'Q84', 'mention_start': 34, 'mention_end': 40, 'sent_pos': 15}] Manchester 1830 1837 0.92 0.1529 Manchester Courier and Lancashire General Advertiser. 206 Q18125 train train train train train train train train train train test train train +12275 [{'sentence_pos': 1, 'sentence_text': 'LYND H UR ST. 
'}, {'sentence_pos': 2, 'sentence_text': 'CAPITAL RUN WITH THE New FOREST Fox HOUND. '}, {'sentence_pos': 3, 'sentence_text': '—On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. '}, {'sentence_pos': 4, 'sentence_text': '%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. '}, {'sentence_pos': 5, 'sentence_text': 'He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction • distance of five miles to Over Fields ; then took • backward direction to Holme !Jill'}, {'sentence_pos': 6, 'sentence_text': ', thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. '}, {'sentence_pos': 7, 'sentence_text': 'RESIGNATION Of Elll TIMSON, MASTEI OF THE New FOREST Fox HOUNDS.— We regret to announce the resignation of Mr. fimson, the esteemed master of the New Forest hourids, who has hunted the country for the last five seasons, to the entire satisfaction of those gentlemen, members of the hunt and others, who have been fertunate enough to witness some of the splendid sport which he has so often afforded during his career as such ; and although he relinquishes the mastership, we earnestly hope that a continuance of good health will enahle Mr. f imson to take is usual prominent place in the bunting field for many years to come. '}, {'sentence_pos': 8, 'sentence_text': 'It should be remembered that Mr. 
'}, {'sentence_pos': 9, 'sentence_text': 'Timson accepted the mastership at a time when no other gentleman could be prevailed upon to do so ; it is therefore to be hoped that the members of the hunt will testify their feeling of gratitude and respect either in the shape of a testimonial or farewell banquet. '}, {'sentence_pos': 10, 'sentence_text': 'Mr. '}, {'sentence_pos': 11, 'sentence_text': 'Timson will be succeeded by Captain W. '}, {'sentence_pos': 12, 'sentence_text': 'Fdornat. '}, {'sentence_pos': 13, 'sentence_text': 'SHIPPING INTELLIGENCr:. '}, {'sentence_pos': 14, 'sentence_text': 'POOLE.'}] [{'mention_pos': 0, 'mention': 'LYND H UR ST', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Lyndhurst,_Hampshire', 'wkdt_qid': 'Q3182986', 'mention_start': 0, 'mention_end': 12, 'sent_pos': 1}, {'mention_pos': 1, 'mention': 'New FOREST', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/New_Forest', 'wkdt_qid': 'Q277755', 'mention_start': 21, 'mention_end': 31, 'sent_pos': 2}, {'mention_pos': 2, 'mention': 'New Forest', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/New_Forest', 'wkdt_qid': 'Q277755', 'mention_start': 35, 'mention_end': 45, 'sent_pos': 3}, {'mention_pos': 3, 'mention': 'Boldrewood', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Bolderwood,_Hampshire', 'wkdt_qid': 'Q4939103', 'mention_start': 64, 'mention_end': 74, 'sent_pos': 3}, {'mention_pos': 4, 'mention': 'Ringwood', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Ringwood', 'wkdt_qid': 'Q1248943', 'mention_start': 188, 'mention_end': 196, 'sent_pos': 3}, {'mention_pos': 5, 'mention': 'Church Moor', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 41, 'mention_end': 52, 'sent_pos': 4}, {'mention_pos': 6, 'mention': 'Burley', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Burley,_Hampshire', 'wkdt_qid': 'Q3195509', 'mention_start': 84, 'mention_end': 90, 'sent_pos': 
4}, {'mention_pos': 7, 'mention': 'Burley', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Burley,_Hampshire', 'wkdt_qid': 'Q3195509', 'mention_start': 101, 'mention_end': 107, 'sent_pos': 4}, {'mention_pos': 8, 'mention': 'Beech Beds', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 147, 'mention_end': 157, 'sent_pos': 4}, {'mention_pos': 9, 'mention': 'Oakley', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 175, 'mention_end': 181, 'sent_pos': 4}, {'mention_pos': 10, 'mention': 'Buldre. wood', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Bolderwood,_Hampshire', 'wkdt_qid': 'Q4939103', 'mention_start': 30, 'mention_end': 42, 'sent_pos': 5}, {'mention_pos': 11, 'mention': 'Gillett Inclosure', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 51, 'mention_end': 68, 'sent_pos': 5}, {'mention_pos': 12, 'mention': 'Hulme Hill', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 82, 'mention_end': 92, 'sent_pos': 5}, {'mention_pos': 13, 'mention': 'Thrifty Beeches', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 97, 'mention_end': 112, 'sent_pos': 5}, {'mention_pos': 14, 'mention': 'Over Fields', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 216, 'mention_end': 227, 'sent_pos': 5}, {'mention_pos': 15, 'mention': 'Holme !Jill', 'entity_type': 'LOC', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 264, 'mention_end': 275, 'sent_pos': 5}, {'mention_pos': 16, 'mention': 'Emery Down', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Emery_Down', 'wkdt_qid': 'Q5370984', 'mention_start': 12, 'mention_end': 22, 'sent_pos': 6}, {'mention_pos': 17, 'mention': 'Minesteed Manor', 'entity_type': 'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 34, 'mention_end': 49, 'sent_pos': 6}, {'mention_pos': 18, 'mention': 'Notherwood', 'entity_type': 
'BUILDING', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 75, 'mention_end': 85, 'sent_pos': 6}, {'mention_pos': 19, 'mention': 'New FOREST', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/New_Forest', 'wkdt_qid': 'Q277755', 'mention_start': 42, 'mention_end': 52, 'sent_pos': 7}, {'mention_pos': 20, 'mention': 'New Forest', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/New_Forest', 'wkdt_qid': 'Q277755', 'mention_start': 146, 'mention_end': 156, 'sent_pos': 7}, {'mention_pos': 21, 'mention': 'POOLE', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Poole', 'wkdt_qid': 'Q203349', 'mention_start': 0, 'mention_end': 5, 'sent_pos': 14}] Poole 1860 1860 0.879 0.1578 The Poole and South-Western Herald, etc. 2325 Q203349 train train train train train train train train train train dev train test +12670 [{'sentence_pos': 1, 'sentence_text': 'NOTICE. '}, {'sentence_pos': 2, 'sentence_text': 'THE STAR, Political, Naval, Military, I Literary, and Commercial Intelligencer, and General Advertiser. '}, {'sentence_pos': 3, 'sentence_text': 'Established in 1813.'}, {'sentence_pos': 4, 'sentence_text': '—This paper which is published every Tuesday, Thursday, and Saturday evening, contains, in addition to ectracts from the British and Foreign newspapers of articles of political and general intelligence, copious and carefully-selected information as to all naval, military, and commercial affairs, and especially that which relates to the trade and commerce of the Channel Islands. '}, {'sentence_pos': 5, 'sentence_text': 'The following are the subscription prices to The Star, payable quarterly :-For three numbers per week For two do. do. '}, {'sentence_pos': 6, 'sentence_text': 'For one do. do. 2s. do. '}, {'sentence_pos': 7, 'sentence_text': 'Single numbers, 2d. each. 
'}, {'sentence_pos': 8, 'sentence_text': 'Subscribers in the United Kingdom will be charged on the following scale, in British money, including pre-payment and posting, payable in advance : For three numbers per week....9s. 9d. per quarter For two do. do. ....Bs. '}, {'sentence_pos': 9, 'sentence_text': 'Bd. do. '}, {'sentence_pos': 10, 'sentence_text': 'For one do. do. ....3s. 4d. do. '}, {'sentence_pos': 11, 'sentence_text': 'The Star (or three numbers folded together) can be forwarded by post, to any part of the United Kingdom or France, on allicing a postage stamp of one PENNI: sir Address—No. 10, Bordage-Street, Guernsey.'}] [{'mention_pos': 0, 'mention': 'Channel Islands', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Channel_Islands', 'wkdt_qid': 'Q42314', 'mention_start': 364, 'mention_end': 379, 'sent_pos': 4}, {'mention_pos': 1, 'mention': 'United Kingdom', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/United_Kingdom', 'wkdt_qid': 'Q145', 'mention_start': 19, 'mention_end': 33, 'sent_pos': 8}, {'mention_pos': 2, 'mention': 'Bordage-Street', 'entity_type': 'STREET', 'wkpd_url': '*', 'wkdt_qid': 'NIL', 'mention_start': 177, 'mention_end': 191, 'sent_pos': 11}, {'mention_pos': 3, 'mention': 'Guernsey', 'entity_type': 'LOC', 'wkpd_url': 'https://en.wikipedia.org/wiki/Guernsey', 'wkdt_qid': 'Q3311985', 'mention_start': 193, 'mention_end': 201, 'sent_pos': 11}] Poole 1860 1860 0.8953 0.1361 The Poole and South-Western Herald, etc. 
2325 Q203349 dev train test train train train train train train train dev train test diff --git a/tests/sample_files/experiments/outputs/data/lwm/ner_fine_dev.json b/tests/sample_files/experiments/outputs/data/lwm/ner_fine_dev.json new file mode 100644 index 00000000..3532a1aa --- /dev/null +++ b/tests/sample_files/experiments/outputs/data/lwm/ner_fine_dev.json @@ -0,0 +1,41 @@ +{"id":"3896074_13","ner_tags":["O","B-OTHER","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Another","Austerlitz","demand",",","Another","Jena","claim",";","And","desolate","the","groaning","lan","1","To","write","one","despots","fame","!"]} +{"id":"3896074_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Whilst","hoa","y","heads",",","all","stark","and","gashed",",","Throng","shore","and","town","remote",",","As","when","the","Gallic","armies","plashed","In","carnage","to","the","throat","!"]} +{"id":"3580760_19","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Smuggling","is","going","on","actively",";","it","is","even","said","that","there","are","companies","who","insure","that","kind","of","speculation","."]} +{"id":"12275_11","ner_tags":["O","O","O","O","O","O","O","O"],"tokens":["Timson","will","be","succeeded","by","Captain","W","."]} 
+{"id":"12275_5","ner_tags":["O","O","O","O","O","O","O","B-LOC","I-LOC","I-LOC","O","O","B-LOC","I-LOC","O","O","O","B-LOC","I-LOC","O","O","B-LOC","I-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","I-LOC","O","O","O","O","O","O","O","B-LOC","I-LOC","I-LOC"],"tokens":["He","then","made","a","turn","back","over","Buldre",".","wood","Hill","to","Gillett","Inclosure",",","and","through","Hulme","Hill",",","to","Thrifty","Beeches",";","here","he","again","turned","to","the","lett",",","sod","ran","almost","in","a","straight","direction","\u2022","distance","of","five","miles","to","Over","Fields",";","then","took","\u2022","backward","direction","to","Holme","!","Jill"]} +{"id":"3691199_7","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Their","exhausted","funds","had","been","replenished","to","a","degree","even","beyond","their","anticipations","by","the","collections","made","throughout","the","country","under","the","authority","of","the","kings","letter","."]} +{"id":"3896074_5","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Are","plunderd","shrine",",","and","midnight","chain","\u2014","A","shrieking","citys","blaze","\u2014","And","beauty",",","in","polution","slain",",","The","themes","for","poets","praise","?"]} 
+{"id":"4939308_12","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["After","complimenting","the","workmen","on","having","done","their","duty","to","the","two","shops",",","he","alluded","to","the","dangers","connected","with","the","building","trade","from","the","excavator","to","the","slater",",","and","said","that","no","buildings","in","town","surpassed","those","which","had","led","to","that","night","\u2019","s","gathering","in","drainage","and","other","sanitory","arrangements","."]} +{"id":"12670_7","ner_tags":["O","O","O","O","O","O","O"],"tokens":["Single","numbers",",","2d",".","each","."]} +{"id":"12275_3","ner_tags":["O","O","O","O","O","O","O","O","O","B-LOC","I-LOC","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O"],"tokens":["\u2014","On","Thursday",",","the","9th","instant",",","the","New","Forest","fox","hounds","met","at","Boldrewood",",","arid","drew","Holme","Hill","Inclousure",",","where","the","famous","bitch","pock","soon","winded","a","Inc","old","fox",",","who","stole","away","over","the","Ringwood","rued","to","Gillett","Inclusure","."]} +{"id":"3896074_7","ner_tags":["O","O","O","O","O","O"],"tokens":["Dethrone","the","crowned","of","earth","!"]} +{"id":"3938653_12","ner_tags":["O","O"],"tokens":["Mr","."]} +{"id":"3580760_5","ner_tags":["O","O","O","O","O","O","O"],"tokens":["Four","oJClock\u2014Consols","for","Account",",","83J","."]} 
+{"id":"4939308_13","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Every","aperture","was","properly","trapped",",","an","important","considei","ation",",","seeing","that","the","medical","men","of","all","large","towns","declared","that","the","great","causes","of","disease","were","impure","water","and","unwholesome","stenches","."]} +{"id":"12670_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["For","one","do",".","do",".",".",".",".",".3s",".","4d",".","do","."]} +{"id":"4939308_23","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["In","conclusion",",","he","gave","a","composition","of","his","own",",","entitled","\u201c","Defence",",","not","defiance",".","\u201d"]} +{"id":"12670_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-STREET","O","B-LOC","O"],"tokens":["The","Star","(","or","three","numbers","folded","together",")","can","be","forwarded","by","post",",","to","any","part","of","the","United","Kingdom","or","France",",","on","allicing","a","postage","stamp","of","one","PENNI",":","sir","Address\u2014No",".","10",",","Bordage-Street",",","Guernsey","."]} +{"id":"10734579_11","ner_tags":["O","O","O","O","O","O","O","O","O"],"tokens":["And","feel","for","respect","so","profound","a","desire","."]} +{"id":"12275_14","ner_tags":["B-LOC","O"],"tokens":["POOLE","."]} 
+{"id":"3691199_13","ner_tags":["O","O","O","O","O","O","O","O","B-LOC","O","B-LOC","O","B-LOC","O","B-LOC","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O"],"tokens":["Contributions","had","been","received","from","diocesan","associations","in","Bath","and","Wells",",","Winchester",",","Exeter",",","and","Cleveland",",","which","were","gratifying","proofs","of","the","increasing","desire","to","relieve","the","spiritual","wants","of","the","people","of","the","Church","of","England","."]} +{"id":"8199709_8","ner_tags":["O","O"],"tokens":["Mersrs","."]} +{"id":"8263166_2","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","mild","open","weather","through","the","month","has","revived","those","plants","ot","Wheat",",","uhieh","on","very","light","soils","appeared","much","injured","by","the","severity","of","the","late","frost",".","llieir","general","appearance","is","promising","."]} +{"id":"4939308_24","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","toast","was","well","received",",","the","Chairman","speaking","very","highly","of","the","volunteers","of","the","country",",","and","pointing","out","that",",","if","properly","managed",",","they","would","very","much","tend","to","a","decrease","in","the","annual","military","and","naval","expenditure","of","the","country","."]} +{"id":"3896074_6","ner_tags":["O","O","O","O","O"],"tokens":["Marengos","hero","then","recall","!"]} 
+{"id":"3580760_20","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O"],"tokens":["The","value","of","houses","and","other","fixtures","decreases","considerably","in","our","town",",","while","it","rises","at","Offenbach","."]} +{"id":"10734579_2","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["\u2014","How","delightful","to","see","a","Lady","or","Gentleman","\u2019","s","beautiful","black","shining","Hoot","or","Shoe","reflecting","every","surrounding","object","in","TURNER","\u2019","S","UNRIVALLED","BLACKING","."]} +{"id":"8263166_9","ner_tags":["O","O","O","O"],"tokens":["Pork","is","reasonable","."]} +{"id":"3896074_14","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Let","cen","the","pyramids","afford","A","verse","for","his","renown",":","But","speak","not","of","brave","Sidneys","sword",",","That","swept","the","invader","down","I","Sing","not","of","Moscows","flaming","tide","\u2014","The","fiery","brands","which","hurled","The","chainer","of","the","nations","wide",",","The","scourfer","of","the","world","."]} +{"id":"3691199_12","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","grants","of","last","year","were","deemed","particularly","important",",","as","while","the","society","had","contributed","towards","the","increase","of","accommodation","in","many","of","the","present","churches","they","had","contributed","to","the","erection","of","forty-one","additional","churches","and","chapels",",","and","the","rebuilding","of","sixteen","others","."]} 
+{"id":"12670_6","ner_tags":["O","O","O","O","O","O","O","O","O","O"],"tokens":["For","one","do",".","do",".","2s",".","do","."]} +{"id":"10734579_3","ner_tags":["O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["This","invaluable","composition","may","behad","at","the","Dorset","County","Chronicle","Office",",","and","of","all","the","respectable","shop",",","keepers","in","Dorchester","and","its","vicinity",",","in","stone","bottles",",","at","6u.t","Is",".",",","and","18d",".","each","."]} +{"id":"12670_5","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","following","are","the","subscription","prices","to","The","Star",",","payable","quarterly",":","-","For","three","numbers","per","week","For","two","do",".","do","."]} +{"id":"10734579_12","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["May","hear","the","fair","Ladies",",","if","Turners","they","uae",",","Cry",",","How","charmingly","polished","his","manners","and","shoes","*","Vc","Authors","and","Poets","who","gladly","engage","To","reform","and","instruct","this","degenerate","age","."]} +{"id":"4939308_5","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["On","Thursday","evening","the","workmen","employed","thereon",",","and","others",",","to","the","number","of","forty",",","sat","dowu","to","a","spread","of","geese",",","turkey",",","and","other","good","things",",","at","the","house","of","Mr","."]} 
+{"id":"8263166_6","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Clover","seed","is","expected","to","turn","out","a","fair","average","crop","."]} +{"id":"3580760_6","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["By","the","Romona",",","steam","boat",",","we","have","received","accounts","from","Oporto","to","the","21st",";","and","from","the","fleet",",","which","was","in","latitude","37",".","21",".",",","longitude","11",".","37",".",",","to","the","18th","instant","."]} +{"id":"3938653_9","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-BUILDING","I-BUILDING","O","B-BUILDING","I-BUILDING","I-BUILDING","O"],"tokens":["In","the","afternoon","their","Royal","Highnesses","went","a","short","yachting","excursion","in","the","Earl","of","Edgecunibes","yacht",";","and","in","the","evening","the","Prince","dined","with","Viscount","Templetown",",","Commander-inChief","of","the","Western","District",",","at","the","Government","House",",","Mo","ant","Wise","."]} +{"id":"8263166_3","ner_tags":["O","O","O","O","O","O","O","O","O"],"tokens":["The","markets","for","all","kindsof","Com","continue","depressed","."]} +{"id":"10734579_15","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["And","the","chemical","volume","of","nature","unfurl","\u2019","d",",","Yet","in","chemistry","surely","there","something","was","lacking",",","TUI","Turner","discover","\u2019","d","his","wonderful","Blacking","."]} +{"id":"10734579_10","ner_tags":["O","O","O","O","O","O","O","O","O","O"],"tokens":["The","Gentlemen","too",",","who","would","boast","their","attire","."]} 
+{"id":"8199709_9","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Molloy","\u2019","s","method","of","applying","the","PATENT","GUMCOLOURED","VULCANITE","restores","the","deficiency","of","the","gums","and","teeth",",","the","natural","appearance","of","the","featu-es",",","and","is","particularly","applicable","in","those","cases","in","which","from","old","age","or","long","loss","of","the","teeth","the","gums","have","become","sarnnk","or","wasted","."]} diff --git a/tests/sample_files/experiments/outputs/data/lwm/ner_fine_train.json b/tests/sample_files/experiments/outputs/data/lwm/ner_fine_train.json new file mode 100644 index 00000000..bcc2c70d --- /dev/null +++ b/tests/sample_files/experiments/outputs/data/lwm/ner_fine_train.json @@ -0,0 +1,141 @@ +{"id":"4939308_7","ner_tags":["O","O","O"],"tokens":["Afterwards","Mr","."]} +{"id":"8263166_7","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["On","dry","roils","Beaus","are","getting","in",",","and","some","early","Peas","alxi",";","bu",";","the","heavy","rains","which","fell","in","die","mhidle","of","the","mouth",",","will","retard","tlieir","souring","on","strong","lands","\u2014","I","\/","can","1","leasts","are","scarce",",","and","somewhat","dearer",";","lx","it","Store","Sheep","continue","low","priced","."]} 
+{"id":"3691199_9","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["During","the","last","year","188","applications","had","been","made",",","and","the","grants","had","been","108",",","both","exceeding","those","of","any","former","year",",","the","latter","by","35","."]} +{"id":"3896074_4","ner_tags":["O","O","O","O","O"],"tokens":["The","trafficker","in","death","!"]} +{"id":"8199709_12","ner_tags":["O","O","O","O","O","O"],"tokens":["Terms\u2014A","single","teoth","from","ss","."]} +{"id":"12670_2","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["THE","STAR",",","Political",",","Naval",",","Military",",","I","Literary",",","and","Commercial","Intelligencer",",","and","General","Advertiser","."]} +{"id":"8263166_8","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["South","field","for","the","last","two","or","three","weeks","has","been","but","thinly","supplied","with","prime","Beef",",","but","with","plenty","of","good","\"","Mutton",";","House","Lamb","is","rather","scarce",",","aud","at","present","dear","."]} +{"id":"3896074_8","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Shake","oer","a","weeping","world","deaths","pallCheer","Horrors","brood","to","birth","!"]} +{"id":"3691199_17","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","society",",","therefore",",","looked","with","joy","to","the","past",",","and","to","the","future","with","the","confident","hope","that","they","might","go","on","and","prosper","."]} 
+{"id":"4938614_4","ner_tags":["O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Martha","Wilde",",","who","was","sent","from","the","Dukinfield","court","for","obtaining","money","under","false","pretences","by","representing","at","two","pawnshops","in","Dukinfield","that","a","spurious","composition","called","coraline","beads","were","real","coral","was","discharged","."]} +{"id":"4938614_12","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["He","was","immediately","conveyed","in","a","cab","to","the","Infirmary",",","where","we","understand","he","is","progressing","favourably","."]} +{"id":"4939308_25","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O"],"tokens":["Mr",",","Henbt","Jeffreys","proposed","\u201c","Prosperity","to","the","town","and","trade","of","Stalybridge",",","with","a","few","appropriate","remarks","."]} +{"id":"3938653_4","ner_tags":["O","O"],"tokens":["Emma","."]} +{"id":"3896074_24","ner_tags":["O","O","O"],"tokens":["CHARLES","SWAIN","."]} +{"id":"3896074_15","ner_tags":["O","O","O","O","O","O","O","O"],"tokens":["With","rout","and","havoc","from","their","shore","!"]} +{"id":"10734579_8","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Why","should","coachmen","still","brush","at","old","harness","in","vain",",","When","this","blacking","can","give","it","fresh","lustre","again","!"]} +{"id":"3896074_23","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["And","make","the","conscious","sea","blush","gore","In","shame","for","Nelsons","land",";","Make","earth",",","revolting",",","lift","once","more","Her","blood-accusing","hand","!"]} 
+{"id":"3580760_21","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","number","of","poor","increases","daily",",","as","well","on","account","of","the","bad","harvest","as","of","the","exorbitant","taxes","which","they","are","obliged","to","pay","to","the","customs","for","the","introduction","of","their","merchandize","."]} +{"id":"12275_10","ner_tags":["O","O"],"tokens":["Mr","."]} +{"id":"8199709_5","ner_tags":["O","O","O","O"],"tokens":["Bostock",",","Chemist","."]} +{"id":"4938614_1","ner_tags":["B-LOC","O"],"tokens":["DUKINFIELD","."]} +{"id":"4938614_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Tame","Valley",",","was","engaged",",","on","Wednesday",",","in","cleaning","some","jennies","in","one","of","the","rooms","he","met","with","a","severe","accident","to","his","head",".","_","The","spinner",",","who","is","rather","deaf",",","not","knowing","that","the","lad","was","amongst","the","machinery",",","set","the","jennies","going",",","and","being","unable","to","hear","the","shouts","of","the","lad","on","account","of","the","defect","in","his","hearing",",","did","not","stop","them","until","another","man","made","signs","to","him","to","stop",",","which","he","did","instantly","."]} +{"id":"4939308_6","ner_tags":["O","O","B-BUILDING","I-BUILDING","I-BUILDING","O","B-STREET","O"],"tokens":["Bray",",","Dog","and","Partridge",",","Market-street","."]} +{"id":"12670_3","ner_tags":["O","O","O","O"],"tokens":["Established","in","1813","."]} 
+{"id":"3580760_1","ner_tags":["O","O"],"tokens":["Postscript","."]} +{"id":"3938653_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O"],"tokens":["The","party","consisted","principally","of","the","army","and","navy","officers","in","commission","at","the","port",",","andthe","officers","\u2022","f","the","French","squadron","lying","in","the","Sound","."]} +{"id":"4938614_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","poor","lad","had",",","however",",","been","severely","hurt","on","the","head",",","although","no","limbs","were","injured","."]} +{"id":"12275_8","ner_tags":["O","O","O","O","O","O","O"],"tokens":["It","should","be","remembered","that","Mr","."]} +{"id":"3691199_16","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Since","their","institution","in","1818","the","society","had","expended","\u00a3220,731",",","thus","insuring","354,925","additional","sittings",",","of","which","262,366","were","free","and","unappropriated","."]} +{"id":"3691199_5","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Sir","Henry","Dukenfield",",","Bart",".",",","Lord","Kenyon",",","and","many","other","distinguished","clergymen","and","laymen",",","were","also","present","."]} +{"id":"3580760_8","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Affairs","remained","in","nearly","the","same","state","as","when","the","last","accounts","left","."]} +{"id":"4939308_22","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["He","felt","sure","that","if","the","services","of","the","volunteers","should","be","required",",","they","would","worthily","emulate","the","deeds","of","old","."]} 
+{"id":"4939308_8","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Napoleon","Ives",",","who","paid","for","the","dinner",",","occupied","the","chair",",","and","Mr","."]} +{"id":"12275_2","ner_tags":["O","O","O","O","B-LOC","I-LOC","O","O","O"],"tokens":["CAPITAL","RUN","WITH","THE","New","FOREST","Fox","HOUND","."]} +{"id":"3938653_8","ner_tags":["O","O","O","O","O","O","B-BUILDING","I-BUILDING","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["A","dejeuner","was","given","to-day","at","Mount","Edgecumbe","to","about","100","of","the","elite","of","the","neighbourhood",",","invited","to","meet","the","Prince","and","Princess","of","Wales","."]} +{"id":"3896074_22","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O"],"tokens":["Yes",",","wake","the","throb","of","sympathyBid","maudlin","tears","reveal","How","much","men","grieve","for","Europe","freeHow","miss","the","tyrants","steel","!"]} +{"id":"3691199_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","sittings","granted","had","been","41,710",",","being","7,000","above","any","precedent",";","and","of","those","28,872","were","free","and","unappropriated","."]} +{"id":"3691199_2","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-BUILDING","I-BUILDING","I-BUILDING"],"tokens":["The","incorporated","society","for","promoting","the","enlargement",",","building",",","and","repairing","of","churches","and","chapels",",","held","their","nineteenth","annual","general","committee","on","Friday","week",",","at","their","chambers","in","St",".","Martins-place"]} +{"id":"10734579_5","ner_tags":["O","O","O","O","O","O","O","O","O","O","O"],"tokens":["And","makes","your","boots","proof","gainst","the","wind","and.the","weather","."]} 
+{"id":"3691199_3","ner_tags":["O","O","O","O","O","O","O","O","O"],"tokens":[";","the","Archbishop","of","Canterbury","in","the","chair","."]} +{"id":"4938614_2","ner_tags":["B-LOC","O","O"],"tokens":["Knutsford","Sessions","."]} +{"id":"3896074_20","ner_tags":["O","O","O","O","O"],"tokens":["That","Draco\u2014in","his","laws","!"]} +{"id":"8199709_1","ner_tags":["O","O"],"tokens":["NOTICE","."]} +{"id":"4939308_26","ner_tags":["O","O","O","O","O","O","O","O","O","O","O"],"tokens":["After","it","had","been","duly","drunk",",","the","Chairman","responded","."]} +{"id":"3691199_1","ner_tags":["O","O","O","O"],"tokens":["CHURCHES","AND","CHAPELS","."]} +{"id":"3691199_6","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","report","of","the","committee","stated","that","they","felt","great","satisfaction","and","thankfulness","towards","the","Giver","of","all","Good","in","being","able","to","announce","that",",","during","the","past","year",",","the","society","had","exerted","itself","with","undiminished","activity","and","success","."]} +{"id":"8199709_7","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Their","method","of","fixing","them","defies","detection","and","dispenses","with","all","painful","operations","."]} +{"id":"4939308_15","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["He","had",",","therefore",",","erected","baths","in","each","of","the","shops",",","which","could","be","supplied","with","either","hot","or","cold","water","."]} 
+{"id":"3896074_17","ner_tags":["O","O","B-OTHER","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","I-LOC","O","O","B-LOC","O"],"tokens":["Let","net","Trafalgars","chief",",","who","died",",","A","moments","thought","beguile",";","Nor","laud","with","British","heart","and","pride","The","Baltic","and","the","Nile","!"]} +{"id":"3580760_2","ner_tags":["B-LOC","O","O","O","O","O","O"],"tokens":["LONDON",",","THURSDAY",",","SEPTEMBER","27","."]} +{"id":"3896074_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["His","meteor-sceptre","pledge","once","more",";","Napoleon","to","the","van","I","Come",",","quaff","the","reeking","cup","of","gore",",","And","shout","for","slaughter","!"]} +{"id":"12275_7","ner_tags":["O","O","O","O","O","O","O","O","B-LOC","I-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","I-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["RESIGNATION","Of","Elll","TIMSON",",","MASTEI","OF","THE","New","FOREST","Fox","HOUNDS",".","\u2014","We","regret","to","announce","the","resignation","of","Mr",".","fimson",",","the","esteemed","master","of","the","New","Forest","hourids",",","who","has","hunted","the","country","for","the","last","five","seasons",",","to","the","entire","satisfaction","of","those","gentlemen",",","members","of","the","hunt","and","others",",","who","have","been","fertunate","enough","to","witness","some","of","the","splendid","sport","which","he","has","so","often","afforded","during","his","career","as","such",";","and","although","he","relinquishes","the","mastershi
p",",","we","earnestly","hope","that","a","continuance","of","good","health","will","enahle","Mr",".","f","imson","to","take","is","usual","prominent","place","in","the","bunting","field","for","many","years","to","come","."]} +{"id":"12275_12","ner_tags":["O","O"],"tokens":["Fdornat","."]} +{"id":"3896074_19","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["And","gloze","the","tyrants","guilty","mood",",","And","wail","his","hapless","cause",":","That","Sylla\u2014in","his","thirst","for","blood","!"]} +{"id":"10734579_4","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["A","CH","A","t.I.r.NG","R",".","1","challenge","the","world","to","produce",",","if","it","can",",","A","Blacking","that","\u2019","s","equal","to","Turner","\u2019","s","Japan",":","For","like","crystal","it","shines",",","while","it","softens","the","leather","."]} +{"id":"8199709_3","ner_tags":["O","O"],"tokens":["TEETH","."]} +{"id":"4938614_8","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-BUILDING","I-BUILDING","I-BUILDING","I-BUILDING","I-BUILDING","I-BUILDING"],"tokens":["\u2014","Whilst","a","boy","named","Edwin","_","Diggle",",","14","years","of","age",",","a","pieoer","at","Mr",".","Chadwick","\u2019","s","factory"]} +{"id":"3938653_6","ner_tags":["O","O","O","O","O","O","O"],"tokens":["THE","PRINCE","AND","PRINCESS","OF","WALES","."]} 
+{"id":"4939308_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["It","was","one","which","had","been","almost","extinguished",",","but","he","had","always","thought","if","ever","it","should","be","his","fortune","to","erect","any","building",",","he","would","give","a","supper","to","the","workmen","engaged","in","its","erection","."]} +{"id":"3580760_9","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["He","have","seen","a","letter",",","of","which","the","following","is","an","extract",",","from","Lieutenant-Colonel","Hodges",":","\u2014","\"","On","the","16th","instant","we","had","a","brilliant","affair","."]} +{"id":"4938614_9","ner_tags":["O"],"tokens":["."]} +{"id":"4938614_3","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O"],"tokens":["\u2014","The","servant","girl",",","Eliza","Ann","Byrom",",","who","stole","a","quantity","of","clothes","from","the","house","where","she","lodged",",","in","Dukiafield",",","was","sentenced","to","two","months","\u2019","imprisonment","."]} +{"id":"10734579_13","ner_tags":["O","O","O","O","O","O","O","O","O","O"],"tokens":["Use","Turners","Japan","and","your","fame","will","take","root","."]} 
+{"id":"10734579_18","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Gentlemen","may","observe","that","this","Composition",",","when","wed","for","their","Gig","and","Carriage","Harness",",","after","one","or","two","application","*","will","produce","a","brilliant",",","rich",",","glossy","black","lustre",",","and","it","the","same","time","act","as","a","preserver","of","the","leather",",","RICHARD","IX","RNER","."]} +{"id":"10734579_7","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["When","those","who","have","used","it","must","know","it","so","well","?"]} +{"id":"4939308_18","ner_tags":["O","O"],"tokens":["Mr","."]} +{"id":"3896074_18","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O"],"tokens":["Leave","Nelsons","glorious","flag","unsung",",","And","Wellington","unwreathed",";","Their","fame","with","which","all","Europe","rung",",","For","his","\u2014","whod","best","neer","breathed","!"]} +{"id":"4938614_6","ner_tags":["O","O","O","O","O","O"],"tokens":["Brandt","appeared","for","the","prisoner","."]} +{"id":"12275_6","ner_tags":["O","O","O","B-LOC","I-LOC","O","O","O","B-BUILDING","I-BUILDING","O","O","O","O","O","O","B-BUILDING","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":[",","thence","to","Emery","Down",",","crowing","to","Minesteed","Manor",";","he","ther","tacked","back","to","Notherwood",",","and","from","thence","back","again","to","the","Manor",",","where",",","after","a","brilliant","run","(","Arnie","hour","and","forty-five","minutes",",","Reynold","was","compelled","to","succumb","to","his","pursuers","."]} 
+{"id":"3580760_7","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O"],"tokens":["The","two","fleets","remained","in","sight","of","each","other",",","and","it","was","expected","would","come","to","an","engagement","as","soon","as","Sartorius","had","been","joined","by","the","vessels","which","he","was","then","expecting","at","Oporto","."]} +{"id":"3896074_2","ner_tags":["O","O","O","O","O","O","O","O","O"],"tokens":["What","bard","art","thou","so","apt","to","grace","."]} +{"id":"3938653_7","ner_tags":["B-LOC","O","O","O","O","O"],"tokens":["Plymouth",",","Thursday","Evenhji",";","."]} +{"id":"8199709_4","ner_tags":["O","O","O","O","O","O","O","O","O","B-STREET","O","O","B-BUILDING","I-BUILDING","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-STREET","O","B-LOC","O","O","O","O","O","O"],"tokens":["Messrs","molloy",",","surgeon","dentists",",","of","112",",","Rusholme-road","(","near","All","Saints",")",",","Manchester",",","rm-y","be","consulted","every","Saturday",",","from","Ten","till","Five","o","\u2019","clock",",","at","931","Stamford-slreet",",","Ashton",",","tee","residence","of","Mr","."]} +{"id":"4939308_3","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","B-STREET","O","O","O"],"tokens":["\u201d","*","4","During","the","last","year","two","shops","have","been","erected","in","Melbourne-street","by","Mr","."]} +{"id":"3691199_4","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","Bishop","of","Durham",",","the","Bishop","of","Winchester",",","the","Bishop","of","Gloucester","and","Bristol",",","the","Bishop","of","Hereford",",","the","Bishop","of","Bangor",",","the","Dean","of","Norwich",",","the","Rev","."]} +{"id":"4938614_5","ner_tags":["O","O"],"tokens":["Mr","."]} 
+{"id":"3938653_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-BUILDING","I-BUILDING","I-BUILDING","O","B-LOC","O"],"tokens":["After","dinner","the","Prince","adjourned","with","the","company","to","a","grand","ball","given","by","the","united","services","at","the","Royal","William","Yard",",","Stonehouse","."]} +{"id":"4939308_19","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Joseph","Turner","proposed","\u201c","The","Prince","and","Princess","of","Wales",",","and","the","rest","of","the","royal","family",",","\u201d","after","which","Mr","."]} +{"id":"3938653_3","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","Qaeen",",","accompanied","by","Princess","Helena","and","Prince","Albert","Victor",",","drove","oat","in","the","grounds","yesterday","morning","Princess","Helena","rode","on","horseback","in","the","afternoon",",","attended","by","the","Hon","."]} +{"id":"3580760_17","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","inns","alone","have","benefited","by","the","presence","of","foreign","travellers",",","and","of","those","who","have","come","to","our","town","through","business","."]} +{"id":"3691199_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","money","granted","had","been","\u00a321,872",",","being","\u00a3700","above","any","former","year","."]} +{"id":"3896074_21","ner_tags":["O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Mourn","him","who","to","the","conscript","gave","HeHrt-broken","France","a","prey",";","And","sill","could","outraged","nature","brave","With","homicidal","sway","!"]} 
+{"id":"12670_4","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","I-LOC","O"],"tokens":["\u2014","This","paper","which","is","published","every","Tuesday",",","Thursday",",","and","Saturday","evening",",","contains",",","in","addition","to","ectracts","from","the","British","and","Foreign","newspapers","of","articles","of","political","and","general","intelligence",",","copious","and","carefully-selected","information","as","to","all","naval",",","military",",","and","commercial","affairs",",","and","especially","that","which","relates","to","the","trade","and","commerce","of","the","Channel","Islands","."]} +{"id":"3896074_12","ner_tags":["O","O","O"],"tokens":["\u2014","Man","."]} +{"id":"12275_13","ner_tags":["O","O","O","O"],"tokens":["SHIPPING","INTELLIGENCr",":","."]} +{"id":"12275_9","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Timson","accepted","the","mastership","at","a","time","when","no","other","gentleman","could","be","prevailed","upon","to","do","so",";","it","is","therefore","to","be","hoped","that","the","members","of","the","hunt","will","testify","their","feeling","of","gratitude","and","respect","either","in","the","shape","of","a","testimonial","or","farewell","banquet","."]} 
+{"id":"8263166_4","ner_tags":["O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","deficiency","of","Turnips","is","severely","felt","in","Norfolk",",","and","odier","counties",",","where","thev","depend","so","much","upon","them","to","top","their","spring","beasts",".","1","n","the","Northern","districts","they","are","generally","good","Coleseed","for","a","crop","is","doing","well","."]} +{"id":"3896074_9","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["While","startled","bones","of","millions","rise","All","ghastly","as","they","stood","When","thunderous","battle","mock","d","the","skies",",","And","lained","down","human","blood","!"]} +{"id":"10734579_1","ner_tags":["O","O"],"tokens":["NOTICE","."]} +{"id":"8199709_11","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Tneir","WHITE","ENAMEL","restores","black","and","decayed","teeth","to","tneir","original","whiteness",",","prevents","toothache",",","and","makes","a","hollow","tooth","sound","and","useful","for","many","years","."]} +{"id":"4939308_14","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Another","important","matter","ip","connection","with","houses","was","baths",",","without","which","no","home","could","be","considered","complete","."]} +{"id":"3580760_10","ner_tags":["O","O","O","O","O","O","O","O","O","O"],"tokens":["We","drove","the","Miguelites","betore","us","from","all","parts","."]} +{"id":"3938653_5","ner_tags":["O","O"],"tokens":["Lascelles","."]} 
+{"id":"3896074_16","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["To","Cossack","hate","consigned",":","\u2014","Famine","and","frozen","wastes","before",",","Bones","and","the","wolf","behind","."]} +{"id":"10734579_17","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["When","the","whole","of","the","universe","rings","with","his","fame","1","So","arc","Turner","and","Wellinoion","famous","afar",",","One","the","hero","of","Blacking",",","the","other","of","War","!","!"]} +{"id":"4939308_21","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["He","said","Englishmen","were","proud","of","their","army","and","navy",",","and","often","referred","to","their","deeds","in","the","most","patriotic","language","."]} +{"id":"10734579_9","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["For",",","without","second","sight",",","I","can","prophecy","soon",",","That","you","will","see","it","adopted","by","Foot","and","Dragoon",";","For","you","scarcely","can","tell",",","when","drawn","up","on","parade",",","Which","glitters","the","brightest",",","the","boot","or","the","blade","."]} +{"id":"12670_1","ner_tags":["O","O"],"tokens":["NOTICE","."]} +{"id":"10734579_16","ner_tags":["O","O","O","O","O","O","O","O","O","O"],"tokens":["Who","do","not","feel","pride","in","a","Wellingtons","name","."]} +{"id":"8199709_13","ner_tags":["O","O","O","O","O","O","B-STREET","O","B-LOC"],"tokens":["Attendance","every","Saturday","at","231",",","Stamford-street",",","Ashton"]} 
+{"id":"10734579_14","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Then","your","genius","will","shine","as","well","as","your","boot",".","1","hough","Davy","and","Home","have","astonished","the","world","."]} +{"id":"3580760_13","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Santa","Martha","commanded","in","person","against","my","part","of","the","line",",","and","had","three","regiments","of","the","line",",","one","of","volunteers",",","and","one","of","militia",",","against","200","British","and","150","Portuguese",".","1","lost","one","officer",",","Colonel","Staunton",",","one","officer","wounded",",","two","men","killed",",","and","17","wounded",".","\""]} +{"id":"3580760_15","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["No","important","transactions","have","taken","place","as","yet","in","any","article","."]} +{"id":"4939308_1","ner_tags":["O","B-LOC","O","O"],"tokens":["THE","STALYBRIDGE","\u201c","EMPORIUM"]} +{"id":"4939308_27","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["He","referred","to","bad","trade","in","general",",","and","to","that","ef","the","bmild","*"]} +{"id":"4939308_9","ner_tags":["O","O","O","O","O"],"tokens":["Haigh","France","the","vice-chair","."]} +{"id":"4939308_4","ner_tags":["O","O","O","O","O","O","O","O","O","O"],"tokens":["Napoleon","Ives",",","who","give","them","the","above","name","."]} 
+{"id":"3580760_4","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["\u2014","The","consol","market","opened","this","morning","at","84A","lor","the","account",",","but","has","since","been","heavy",",","fluctuating","between","83","|","and","84",",","and","has","now","declined","to","83","\u00a3","for","money",",","and","83","$","J","for","the","account","."]} +{"id":"3938653_1","ner_tags":["O","O","O","O","O"],"tokens":["THE","COURT","AND","GOVERNMENT","."]} +{"id":"4939308_20","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Chabnock","gave",",","with","an","appropriate","address",",","\u201c","The","Army",",","Navy",",","and","Volunteers",".","\u201d"]} +{"id":"8199709_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["By","theuseof","this","perfectly","incorrodible",",","light",",","and","flexible","material",",","from","one","to","a","com","pie","e","set","of","aitifioial","teeth","may","be","worn","with","the","greatest","comfort",",","perfo-ming","all","the","functions","of","natu-ai","teeth","without","causing","the","slightest","pain","or","inconvenience",",","and","can","be","removed","and","replaced","with","the","greatest","ease","."]} 
+{"id":"12670_8","ner_tags":["O","O","O","B-LOC","I-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Subscribers","in","the","United","Kingdom","will","be","charged","on","the","following","scale",",","in","British","money",",","including","pre-payment","and","posting",",","payable","in","advance",":","For","three","numbers","per","week",".",".",".",".9s",".","9d",".","per","quarter","For","two","do",".","do",".",".",".",".",".","Bs","."]} +{"id":"12275_1","ner_tags":["B-LOC","I-LOC","I-LOC","I-LOC","O"],"tokens":["LYND","H","UR","ST","."]} +{"id":"8263166_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Hops","continue","low","and","dull","in","silo",",","and","the","Wool","market","is","rather","more","brisk","for","both","long","and","short","fleeces",";","but","with","little","variation","from","the","last","mouths","prices","."]} +{"id":"3938653_14","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","right","hon",".","gentleman","is","still","confined","to","his","chamber","."]} +{"id":"3580760_3","ner_tags":["B-LOC","O","O","O","O"],"tokens":["City",",","Twelve","oClock","."]} 
+{"id":"3580760_18","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Notwithstanding","the","sad","state","of","our","trade","in","this","circumstance",",","it","cannot","be","said","to","be","more","wretched","than","that","of","Offenbach",",","only","as","our","merchants","transport","their","goods","from","that","town",",","when","they","effect","the","sale","personally",",","the","principal","mass","of","money","entering",",","consequently",",","into","their","coffers",",","gave","a","little","more","animation","to","the","fair","."]} +{"id":"8263166_5","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Winter","Tares","and","Rye",".","for","early","feed",",","in","most","parts","look","kindly","."]} +{"id":"3580760_16","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["We","no","longer","observe","that","spirit","of","activity","which","used","to","reign","in","our","streets",",","especially","at","harvest","time","."]} +{"id":"3691199_14","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["AmjtJgst","their","contributions","was","the","munificent","sum","of","18,000","from","Mr","."]} +{"id":"3938653_2","ner_tags":["B-LOC","O","O","O"],"tokens":["OSEORNE",",","THURSDA","."]} +{"id":"3691199_15","ner_tags":["O","O","O","O","B-STREET","O","B-LOC","O"],"tokens":["George","Davenport",",","of","Lime-street",",","London","."]} 
+{"id":"3580760_14","ner_tags":["B-LOC","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Frankfort","Fair\u2014Sept",".","14","\u2014","The","first","week","of","our","corn","fair","this","year","has","not","otTered","any","satisfactory","result",",","and","we","have","to","add","thatour","autumn","crop","must","be","reckoned","among","the","worst","that","we","have","yet","had","."]} +{"id":"3896074_3","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["With","poetrys","pure","breath",",","The","icatterer","of","the","human","race","!"]} +{"id":"3691199_8","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["All","the","returns","had","not","yet","been","received",",","but","those","which","had",",","had","advised","them","of","contributions","to","the","amount","of","\u00a334,000",",","which","exceeded","the","collection","under","the","same","authority","in","1834",",","by","\u00a33,000",",","and","which","was","still","more","gratifying",",","as",",","during","the","last","year",",","\u00a3116,000","had","been","subscribed","for","the","erection","of","additional","churches","in","the","metropolis","."]} +{"id":"3580760_12","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Never","was","there","any","thing","more","brave","than","their","repulsing","the","enemy","from","the","heights",".","they","actually","fled","before","us",",","and","lost","one","field","officer","and","50","men","."]} +{"id":"12670_9","ner_tags":["O","O","O","O"],"tokens":["Bd",".","do","."]} 
+{"id":"10734579_6","ner_tags":["O","O","O","O","O","O","O","O","O","O"],"tokens":["Why","the","praises","of","Turners","Japan","should","I","tell","."]} +{"id":"8263166_1","ner_tags":["O","O","O","O"],"tokens":["MONTHLY","AGRICULTURAL","REPORT","."]} +{"id":"8199709_6","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["They","continue","to","suppy","their","unrivalled","MINERAL","TEETH","and","ARTIFICIAL","GUMS",",","which","restore","both","the","appearance","of","natural","teeth","and","their","usefulness","in","mastication","."]} +{"id":"3580760_22","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Their","progressive","increase","has","engaged","our","Senate","to","take","measures","for","the","improvement","of","the","asylums","which","are","destined","for","them",",","for","the","keeping","up","of","which","the","subscriptions","of","the","philanthropic","citizens","are","no","Longer","sufficient","Suabian","Mercury",","]} +{"id":"3580760_11","ner_tags":["O","O","O","O","O","O","O","O","O"],"tokens":["My","little","British","band","crowned","themselves","with","glory","."]} +{"id":"4939308_10","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["The","Chairman",",","in","his","opening","remarks",",","said","they","had","assembled","in","accordance","with","a","good","old","custom",",","which","brought","together","all","who","had","been","engaged","in","the","erection","of","buildings","."]} 
+{"id":"4939308_16","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["In","conclusion",",","he","proposed","the","toast","of","\u201c","The","Queen",",","\u201d","and","said","he","hoped","she","might","live","long","to","reign","over","them",",","and","that","her","future","might","be","as","bright","if","not","more","brilliant","than","the","past","."]} +{"id":"3896074_1","ner_tags":["O","O","O"],"tokens":["A","REMONSTRANCE","."]} +{"id":"4939308_2","ner_tags":["O"],"tokens":["."]} +{"id":"4938614_7","ner_tags":["O","O","O","O","O"],"tokens":["Accident","in","a","Factory","."]} +{"id":"12275_4","ner_tags":["O","O","O","O","O","O","O","O","O","B-LOC","I-LOC","O","O","O","O","O","O","O","O","B-LOC","O","O","B-LOC","O","O","O","O","O","O","O","O","B-LOC","I-LOC","O","O","O","O","B-LOC","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["%","here","he","turned","to","the","right","and","crossed","Church","Moor",",","thence","at","a","clipping","pace",",","to","Burley",";","leaving","Burley","to","the","left",",","he","pushed","forward","through","Beech","Beds","to","the","earths","at","Oakley",",","where",",","to","his","aurprse",",","he","found","no","admission","."]} +{"id":"4939308_17","ner_tags":["O","O","O","O","O","O","O","O"],"tokens":["The","toast","was","received","with","musical","honours","."]} +{"id":"8199709_2","ner_tags":["O","O","O","O","O","O","O"],"tokens":["\u2014","REGULAR","WEEKLY","ATTENDANCE",",","TEETH","."]} +{"id":"3938653_13","ner_tags":["O","O","O","O","O","O","O","O","O","O","O","O","O","O","O","O"],"tokens":["Frederick","Peel","ia","in","improved","health",",","and","has","been","gradually","getting","better","since","Tuesday","."]} diff --git a/tests/sample_files/experiments/outputs/data/lwm/rel_dev.json b/tests/sample_files/experiments/outputs/data/lwm/rel_dev.json new file mode 100644 
index 00000000..57526d2f --- /dev/null +++ b/tests/sample_files/experiments/outputs/data/lwm/rel_dev.json @@ -0,0 +1 @@ +{"12670_4": [{"mention": "Channel Islands", "sent_idx": 4, "sentence": "\u2014This paper which is published every Tuesday, Thursday, and Saturday evening, contains, in addition to ectracts from the British and Foreign newspapers of articles of political and general intelligence, copious and carefully-selected information as to all naval, military, and commercial affairs, and especially that which relates to the trade and commerce of the Channel Islands. ", "ngram": "Channel Islands", "context": ["Established in 1813.", "The following are the subscription prices to The Star, payable quarterly :-For three numbers per week For two do. do. "], "pos": 364, "end_pos": 379, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q42314"]}, {"mention": "Channel Islands", "sent_idx": 4, "sentence": "\u2014This paper which is published every Tuesday, Thursday, and Saturday evening, contains, in addition to ectracts from the British and Foreign newspapers of articles of political and general intelligence, copious and carefully-selected information as to all naval, military, and commercial affairs, and especially that which relates to the trade and commerce of the Channel Islands. ", "ngram": "Channel Islands", "context": ["Established in 1813.", "The following are the subscription prices to The Star, payable quarterly :-For three numbers per week For two do. do. "], "pos": 364, "end_pos": 379, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q42314"]}], "12670_8": [{"mention": "United Kingdom", "sent_idx": 8, "sentence": "Subscribers in the United Kingdom will be charged on the following scale, in British money, including pre-payment and posting, payable in advance : For three numbers per week....9s. 9d. per quarter For two do. do. ....Bs. 
", "ngram": "United Kingdom", "context": ["Single numbers, 2d. each. ", "Bd. do. "], "pos": 19, "end_pos": 33, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q145"]}, {"mention": "United Kingdom", "sent_idx": 8, "sentence": "Subscribers in the United Kingdom will be charged on the following scale, in British money, including pre-payment and posting, payable in advance : For three numbers per week....9s. 9d. per quarter For two do. do. ....Bs. ", "ngram": "United Kingdom", "context": ["Single numbers, 2d. each. ", "Bd. do. "], "pos": 19, "end_pos": 33, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q145"]}], "12670_11": [{"mention": "Bordage-Street", "sent_idx": 11, "sentence": "The Star (or three numbers folded together) can be forwarded by post, to any part of the United Kingdom or France, on allicing a postage stamp of one PENNI: sir Address\u2014No. 10, Bordage-Street, Guernsey.", "ngram": "Bordage-Street", "context": ["For one do. do. ....3s. 4d. do. ", ""], "pos": 177, "end_pos": 191, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "STREET", "gold": "NIL"}, {"mention": "Guernsey", "sent_idx": 11, "sentence": "The Star (or three numbers folded together) can be forwarded by post, to any part of the United Kingdom or France, on allicing a postage stamp of one PENNI: sir Address\u2014No. 10, Bordage-Street, Guernsey.", "ngram": "Guernsey", "context": ["For one do. do. ....3s. 4d. do. ", ""], "pos": 193, "end_pos": 201, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q3311985"]}, {"mention": "Bordage-Street", "sent_idx": 11, "sentence": "The Star (or three numbers folded together) can be forwarded by post, to any part of the United Kingdom or France, on allicing a postage stamp of one PENNI: sir Address\u2014No. 10, Bordage-Street, Guernsey.", "ngram": "Bordage-Street", "context": ["For one do. do. ....3s. 4d. 
do. ", ""], "pos": 177, "end_pos": 191, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "STREET", "gold": "NIL"}, {"mention": "Guernsey", "sent_idx": 11, "sentence": "The Star (or three numbers folded together) can be forwarded by post, to any part of the United Kingdom or France, on allicing a postage stamp of one PENNI: sir Address\u2014No. 10, Bordage-Street, Guernsey.", "ngram": "Guernsey", "context": ["For one do. do. ....3s. 4d. do. ", ""], "pos": 193, "end_pos": 201, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q3311985"]}]} \ No newline at end of file diff --git a/tests/sample_files/experiments/outputs/data/lwm/rel_train.json b/tests/sample_files/experiments/outputs/data/lwm/rel_train.json new file mode 100644 index 00000000..92e0cf03 --- /dev/null +++ b/tests/sample_files/experiments/outputs/data/lwm/rel_train.json @@ -0,0 +1 @@ +{"3938653_2": [{"mention": "OSEORNE", "sent_idx": 2, "sentence": "OSEORNE, THURSDA . ", "ngram": "OSEORNE", "context": ["THE COURT AND GOVERNMENT. ", "The Qaeen, accompanied by Princess Helena and Prince Albert Victor, drove oat in the grounds yesterday morning Princess Helena rode on horseback in the afternoon, attended by the Hon. "], "pos": 0, "end_pos": 7, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": "NIL"}], "3938653_7": [{"mention": "Plymouth", "sent_idx": 7, "sentence": "Plymouth, Thursday Evenhji;. ", "ngram": "Plymouth", "context": ["THE PRINCE AND PRINCESS OF WALES. ", "A dejeuner was given to-day at Mount Edgecumbe to about 100 of the elite of the neighbourhood, invited to meet the Prince and Princess of Wales. 
"], "pos": 0, "end_pos": 8, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q43382"]}], "3938653_8": [{"mention": "Mount Edgecumbe", "sent_idx": 8, "sentence": "A dejeuner was given to-day at Mount Edgecumbe to about 100 of the elite of the neighbourhood, invited to meet the Prince and Princess of Wales. ", "ngram": "Mount Edgecumbe", "context": ["Plymouth, Thursday Evenhji;. ", "In the afternoon their Royal Highnesses went a short yachting excursion in the Earl of Edgecunibes yacht; and in the evening the Prince dined with Viscount Templetown, Commander-inChief of the Western District, at the Government House, Mo ant Wise. "], "pos": 31, "end_pos": 46, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "BUILDING", "gold": ["Q6920546"]}], "3938653_9": [{"mention": "Government House", "sent_idx": 9, "sentence": "In the afternoon their Royal Highnesses went a short yachting excursion in the Earl of Edgecunibes yacht; and in the evening the Prince dined with Viscount Templetown, Commander-inChief of the Western District, at the Government House, Mo ant Wise. ", "ngram": "Government House", "context": ["A dejeuner was given to-day at Mount Edgecumbe to about 100 of the elite of the neighbourhood, invited to meet the Prince and Princess of Wales. ", "The party consisted principally of the army and navy officers in commission at the port, andthe officers \u2022f the French squadron lying in the Sound. "], "pos": 218, "end_pos": 234, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "BUILDING", "gold": "NIL"}, {"mention": "Mo ant Wise", "sent_idx": 9, "sentence": "In the afternoon their Royal Highnesses went a short yachting excursion in the Earl of Edgecunibes yacht; and in the evening the Prince dined with Viscount Templetown, Commander-inChief of the Western District, at the Government House, Mo ant Wise. 
", "ngram": "Mo ant Wise", "context": ["A dejeuner was given to-day at Mount Edgecumbe to about 100 of the elite of the neighbourhood, invited to meet the Prince and Princess of Wales. ", "The party consisted principally of the army and navy officers in commission at the port, andthe officers \u2022f the French squadron lying in the Sound. "], "pos": 236, "end_pos": 247, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "BUILDING", "gold": ["Q14912878"]}], "3938653_10": [{"mention": "Sound", "sent_idx": 10, "sentence": "The party consisted principally of the army and navy officers in commission at the port, andthe officers \u2022f the French squadron lying in the Sound. ", "ngram": "Sound", "context": ["In the afternoon their Royal Highnesses went a short yachting excursion in the Earl of Edgecunibes yacht; and in the evening the Prince dined with Viscount Templetown, Commander-inChief of the Western District, at the Government House, Mo ant Wise. ", "After dinner the Prince adjourned with the company to a grand ball given by the united services at the Royal William Yard, Stonehouse. "], "pos": 141, "end_pos": 146, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q2333061"]}], "3938653_11": [{"mention": "Royal William Yard", "sent_idx": 11, "sentence": "After dinner the Prince adjourned with the company to a grand ball given by the united services at the Royal William Yard, Stonehouse. ", "ngram": "Royal William Yard", "context": ["The party consisted principally of the army and navy officers in commission at the port, andthe officers \u2022f the French squadron lying in the Sound. ", "Mr. 
"], "pos": 103, "end_pos": 121, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "BUILDING", "gold": ["Q7375014"]}, {"mention": "Stonehouse", "sent_idx": 11, "sentence": "After dinner the Prince adjourned with the company to a grand ball given by the united services at the Royal William Yard, Stonehouse. ", "ngram": "Stonehouse", "context": ["The party consisted principally of the army and navy officers in commission at the port, andthe officers \u2022f the French squadron lying in the Sound. ", "Mr. "], "pos": 123, "end_pos": 133, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q7619235"]}], "4938614_1": [{"mention": "DUKINFIELD", "sent_idx": 1, "sentence": "DUKINFIELD. ", "ngram": "DUKINFIELD", "context": ["", "Knutsford Sessions."], "pos": 0, "end_pos": 10, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q1976179"]}], "4938614_2": [{"mention": "Knutsford", "sent_idx": 2, "sentence": "Knutsford Sessions.", "ngram": "Knutsford", "context": ["DUKINFIELD. ", "\u2014The servant girl, Eliza Ann Byrom, who stole a quantity of clothes from the house where she lodged, in Dukiafield, was sentenced to two months\u2019 imprisonment. "], "pos": 0, "end_pos": 9, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q1470791"]}], "4938614_3": [{"mention": "Dukiafield", "sent_idx": 3, "sentence": "\u2014The servant girl, Eliza Ann Byrom, who stole a quantity of clothes from the house where she lodged, in Dukiafield, was sentenced to two months\u2019 imprisonment. ", "ngram": "Dukiafield", "context": ["Knutsford Sessions.", "Martha Wilde, who was sent from the Dukinfield court for obtaining money under false pretences by representing at two pawnshops in Dukinfield that a spurious composition called coraline beads were real coral was discharged. 
"], "pos": 104, "end_pos": 114, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q1976179"]}], "4938614_4": [{"mention": "Dukinfield", "sent_idx": 4, "sentence": "Martha Wilde, who was sent from the Dukinfield court for obtaining money under false pretences by representing at two pawnshops in Dukinfield that a spurious composition called coraline beads were real coral was discharged. ", "ngram": "Dukinfield", "context": ["\u2014The servant girl, Eliza Ann Byrom, who stole a quantity of clothes from the house where she lodged, in Dukiafield, was sentenced to two months\u2019 imprisonment. ", "Mr. "], "pos": 36, "end_pos": 46, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q1976179"]}, {"mention": "Dukinfield", "sent_idx": 4, "sentence": "Martha Wilde, who was sent from the Dukinfield court for obtaining money under false pretences by representing at two pawnshops in Dukinfield that a spurious composition called coraline beads were real coral was discharged. ", "ngram": "Dukinfield", "context": ["\u2014The servant girl, Eliza Ann Byrom, who stole a quantity of clothes from the house where she lodged, in Dukiafield, was sentenced to two months\u2019 imprisonment. ", "Mr. "], "pos": 131, "end_pos": 141, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q1976179"]}], "4938614_8": [{"mention": "Mr. Chadwick\u2019s factory", "sent_idx": 8, "sentence": "\u2014Whilst a boy named Edwin _ Diggle, 14 years of age, a pieoer at Mr. Chadwick\u2019s factory", "ngram": "Mr. Chadwick\u2019s factory", "context": ["Accident in a Factory.", ". 
"], "pos": 65, "end_pos": 87, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "BUILDING", "gold": "NIL"}], "4939308_1": [{"mention": "STALYBRIDGE", "sent_idx": 1, "sentence": "THE STALYBRIDGE \u201cEMPORIUM", "ngram": "STALYBRIDGE", "context": ["", "."], "pos": 4, "end_pos": 15, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q1398653"]}], "4939308_3": [{"mention": "Melbourne-street", "sent_idx": 3, "sentence": "\u201d * 4 During the last year two shops have been erected in Melbourne-street by Mr. ", "ngram": "Melbourne-street", "context": [".", "Napoleon Ives, who give them the above name. "], "pos": 59, "end_pos": 75, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "STREET", "gold": "NIL"}], "4939308_6": [{"mention": "Dog and Partridge", "sent_idx": 6, "sentence": "Bray, Dog and Partridge, Market-street. ", "ngram": "Dog and Partridge", "context": ["On Thursday evening the workmen employed thereon, and others, to the number of forty, sat dowu to a spread of geese, turkey, and other good things, at the house of Mr. ", "Afterwards Mr. "], "pos": 6, "end_pos": 23, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "BUILDING", "gold": "NIL"}, {"mention": "Market-street", "sent_idx": 6, "sentence": "Bray, Dog and Partridge, Market-street. ", "ngram": "Market-street", "context": ["On Thursday evening the workmen employed thereon, and others, to the number of forty, sat dowu to a spread of geese, turkey, and other good things, at the house of Mr. ", "Afterwards Mr. "], "pos": 25, "end_pos": 38, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "STREET", "gold": "NIL"}], "4939308_25": [{"mention": "Stalybridge", "sent_idx": 25, "sentence": "Mr, Henbt Jeffreys proposed \u201cProsperity to the town and trade of Stalybridge, with a few appropriate remarks. 
", "ngram": "Stalybridge", "context": ["The toast was well received, the Chairman speaking very highly of the volunteers of the country, and pointing out that, if properly managed, they would very much tend to a decrease in the annual military and naval expenditure of the country. ", "After it had been duly drunk, the Chairman responded. "], "pos": 65, "end_pos": 76, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q1398653"]}], "8199709_4": [{"mention": "Rusholme-road", "sent_idx": 4, "sentence": "Messrs molloy,surgeon dentists, of 112, Rusholme-road (near All Saints), Manchester, rm-y be consulted every Saturday, from Ten till Five o\u2019clock, at 931 Stamford-slreet, Ashton, tee residence of Mr. ", "ngram": "Rusholme-road", "context": ["TEETH. ", "Bostock, Chemist. "], "pos": 40, "end_pos": 53, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "STREET", "gold": "NIL"}, {"mention": "All Saints", "sent_idx": 4, "sentence": "Messrs molloy,surgeon dentists, of 112, Rusholme-road (near All Saints), Manchester, rm-y be consulted every Saturday, from Ten till Five o\u2019clock, at 931 Stamford-slreet, Ashton, tee residence of Mr. ", "ngram": "All Saints", "context": ["TEETH. ", "Bostock, Chemist. "], "pos": 60, "end_pos": 70, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "BUILDING", "gold": "NIL"}, {"mention": "Manchester", "sent_idx": 4, "sentence": "Messrs molloy,surgeon dentists, of 112, Rusholme-road (near All Saints), Manchester, rm-y be consulted every Saturday, from Ten till Five o\u2019clock, at 931 Stamford-slreet, Ashton, tee residence of Mr. ", "ngram": "Manchester", "context": ["TEETH. ", "Bostock, Chemist. 
"], "pos": 73, "end_pos": 83, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q18125"]}, {"mention": "Stamford-slreet", "sent_idx": 4, "sentence": "Messrs molloy,surgeon dentists, of 112, Rusholme-road (near All Saints), Manchester, rm-y be consulted every Saturday, from Ten till Five o\u2019clock, at 931 Stamford-slreet, Ashton, tee residence of Mr. ", "ngram": "Stamford-slreet", "context": ["TEETH. ", "Bostock, Chemist. "], "pos": 154, "end_pos": 169, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "STREET", "gold": "NIL"}, {"mention": "Ashton", "sent_idx": 4, "sentence": "Messrs molloy,surgeon dentists, of 112, Rusholme-road (near All Saints), Manchester, rm-y be consulted every Saturday, from Ten till Five o\u2019clock, at 931 Stamford-slreet, Ashton, tee residence of Mr. ", "ngram": "Ashton", "context": ["TEETH. ", "Bostock, Chemist. "], "pos": 171, "end_pos": 177, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q659803"]}], "8199709_13": [{"mention": "Stamford-street", "sent_idx": 13, "sentence": "Attendance every Saturday at 231, Stamford-street, Ashton", "ngram": "Stamford-street", "context": ["Terms\u2014A single teoth from ss. ", ""], "pos": 34, "end_pos": 49, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "STREET", "gold": "NIL"}, {"mention": "Ashton", "sent_idx": 13, "sentence": "Attendance every Saturday at 231, Stamford-street, Ashton", "ngram": "Ashton", "context": ["Terms\u2014A single teoth from ss. ", ""], "pos": 51, "end_pos": 57, "place": "Ashton-under-Lyne", "place_wqid": "Q659803", "candidates": [], "ner_label": "LOC", "gold": ["Q659803"]}], "3580760_2": [{"mention": "LONDON", "sent_idx": 2, "sentence": "LONDON, THURSDAY, SEPTEMBER 27. ", "ngram": "LONDON", "context": ["Postscript. 
", "City, Twelve oClock."], "pos": 0, "end_pos": 6, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q84"]}], "3580760_3": [{"mention": "City", "sent_idx": 3, "sentence": "City, Twelve oClock.", "ngram": "City", "context": ["LONDON, THURSDAY, SEPTEMBER 27. ", "\u2014The consol market opened this morning at 84A lor the account, but has since been heavy, fluctuating between 83| and 84, and has now declined to 83\u00a3 for money, and 83$ J for the account. "], "pos": 0, "end_pos": 4, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q23311"]}], "3580760_6": [{"mention": "Oporto", "sent_idx": 6, "sentence": "By the Romona, steam boat, we have received accounts from Oporto to the 21st; and from the fleet, which was in latitude 37. 21., longitude 11. 37., to the 18th instant. ", "ngram": "Oporto", "context": ["Four oJClock\u2014Consols for Account, 83J. ", "The two fleets remained in sight of each other, and it was expected would come to an engagement as soon as Sartorius had been joined by the vessels which he was then expecting at Oporto. "], "pos": 58, "end_pos": 64, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q36433"]}], "3580760_7": [{"mention": "Oporto", "sent_idx": 7, "sentence": "The two fleets remained in sight of each other, and it was expected would come to an engagement as soon as Sartorius had been joined by the vessels which he was then expecting at Oporto. ", "ngram": "Oporto", "context": ["By the Romona, steam boat, we have received accounts from Oporto to the 21st; and from the fleet, which was in latitude 37. 21., longitude 11. 37., to the 18th instant. ", "Affairs remained in nearly the same state as when the last accounts left. 
"], "pos": 179, "end_pos": 185, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q36433"]}], "3580760_14": [{"mention": "Frankfort", "sent_idx": 14, "sentence": "Frankfort Fair\u2014Sept. 14\u2014The first week of our corn fair this year has not otTered any satisfactory result, and we have to add thatour autumn crop must be reckoned among the worst that we have yet had. ", "ngram": "Frankfort", "context": ["Santa Martha commanded in person against my part of the line, and had three regiments of the line, one of volunteers, and one of militia, against 200 British and 150 Portuguese. 1 lost one officer, Colonel Staunton, one officer wounded, two men killed, and 17 wounded.\" ", "No important transactions have taken place as yet in any article. "], "pos": 0, "end_pos": 9, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q1794"]}], "3580760_18": [{"mention": "Offenbach", "sent_idx": 18, "sentence": "Notwithstanding the sad state of our trade in this circumstance, it cannot be said to be more wretched than that of Offenbach, only as our merchants transport their goods from that town, when they effect the sale personally, the principal mass of money entering, consequently, into their coffers, gave a little more animation to the fair. ", "ngram": "Offenbach", "context": ["The inns alone have benefited by the presence of foreign travellers, and of those who have come to our town through business. ", "Smuggling is going on actively; it is even said that there are companies who insure that kind of speculation. "], "pos": 116, "end_pos": 125, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q3042"]}], "3580760_20": [{"mention": "Offenbach", "sent_idx": 20, "sentence": "The value of houses and other fixtures decreases considerably in our town, while it rises at Offenbach. 
", "ngram": "Offenbach", "context": ["Smuggling is going on actively; it is even said that there are companies who insure that kind of speculation. ", "The number of poor increases daily, as well on account of the bad harvest as of the exorbitant taxes which they are obliged to pay to the customs for the introduction of their merchandize. "], "pos": 93, "end_pos": 102, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q3042"]}], "8263166_4": [{"mention": "Norfolk", "sent_idx": 4, "sentence": "The deficiency of Turnips is severely felt in Norfolk, and odier counties, where thev depend so much upon them to top their spring beasts. 1 n the Northern districts they are generally good Coleseed for a crop is doing well. ", "ngram": "Norfolk", "context": ["The markets for all kindsof Com continue depressed. ", "Winter Tares and Rye. for early feed, in most parts look kindly. "], "pos": 46, "end_pos": 53, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q23109"]}], "10734579_3": [{"mention": "Dorset", "sent_idx": 3, "sentence": "This invaluable composition may behad at the Dorset County Chronicle Office, and of all the respectable shop, keepers in Dorchester and its vicinity, in stone bottles, at 6u.t Is., and 18d. each. ", "ngram": "Dorset", "context": ["\u2014How delightful to see a Lady or Gentleman\u2019s beautiful black shining Hoot or Shoe reflecting every surrounding object in TURNER\u2019S UNRIVALLED BLACKING. ", "A CH A t.I.r.NG R. 1 challenge the world to produce, if it can, A Blacking that\u2019s equal to Turner\u2019s Japan : For like crystal it shines, while it softens the leather. 
"], "pos": 45, "end_pos": 51, "place": "Dorchester", "place_wqid": "Q503331", "candidates": [], "ner_label": "LOC", "gold": ["Q23159"]}, {"mention": "Dorchester", "sent_idx": 3, "sentence": "This invaluable composition may behad at the Dorset County Chronicle Office, and of all the respectable shop, keepers in Dorchester and its vicinity, in stone bottles, at 6u.t Is., and 18d. each. ", "ngram": "Dorchester", "context": ["\u2014How delightful to see a Lady or Gentleman\u2019s beautiful black shining Hoot or Shoe reflecting every surrounding object in TURNER\u2019S UNRIVALLED BLACKING. ", "A CH A t.I.r.NG R. 1 challenge the world to produce, if it can, A Blacking that\u2019s equal to Turner\u2019s Japan : For like crystal it shines, while it softens the leather. "], "pos": 121, "end_pos": 131, "place": "Dorchester", "place_wqid": "Q503331", "candidates": [], "ner_label": "LOC", "gold": ["Q503331"]}], "3896074_13": [{"mention": "Austerlitz", "sent_idx": 13, "sentence": "Another Austerlitz demand, Another Jena claim; And desolate the groaning lan 1 To write one despots fame ! ", "ngram": "Austerlitz", "context": ["\u2014Man. ", "Let cen the pyramids afford A verse for his renown : But speak not of brave Sidneys sword, That swept the invader down I Sing not of Moscows flaming tide\u2014 The fiery brands which hurled The chainer of the nations wide, The scourfer of the world. "], "pos": 8, "end_pos": 18, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "OTHER", "gold": ["Q134114"]}, {"mention": "Jena", "sent_idx": 13, "sentence": "Another Austerlitz demand, Another Jena claim; And desolate the groaning lan 1 To write one despots fame ! ", "ngram": "Jena", "context": ["\u2014Man. ", "Let cen the pyramids afford A verse for his renown : But speak not of brave Sidneys sword, That swept the invader down I Sing not of Moscows flaming tide\u2014 The fiery brands which hurled The chainer of the nations wide, The scourfer of the world. 
"], "pos": 35, "end_pos": 39, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q3150"]}], "3896074_14": [{"mention": "Moscows", "sent_idx": 14, "sentence": "Let cen the pyramids afford A verse for his renown : But speak not of brave Sidneys sword, That swept the invader down I Sing not of Moscows flaming tide\u2014 The fiery brands which hurled The chainer of the nations wide, The scourfer of the world. ", "ngram": "Moscows", "context": ["Another Austerlitz demand, Another Jena claim; And desolate the groaning lan 1 To write one despots fame ! ", "With rout and havoc from their shore ! "], "pos": 133, "end_pos": 140, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q649"]}], "3896074_17": [{"mention": "Trafalgars", "sent_idx": 17, "sentence": "Let net Trafalgars chief, who died, A moments thought beguile; Nor laud with British heart and pride The Baltic and the Nile! ", "ngram": "Trafalgars", "context": ["To Cossack hate consigned :\u2014 Famine and frozen wastes before, Bones and the wolf behind. ", "Leave Nelsons glorious flag unsung, And Wellington unwreathed; Their fame with which all Europe rung, For his\u2014 whod best neer breathed ! "], "pos": 8, "end_pos": 18, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "OTHER", "gold": ["Q171416"]}, {"mention": "The Baltic", "sent_idx": 17, "sentence": "Let net Trafalgars chief, who died, A moments thought beguile; Nor laud with British heart and pride The Baltic and the Nile! ", "ngram": "The Baltic", "context": ["To Cossack hate consigned :\u2014 Famine and frozen wastes before, Bones and the wolf behind. ", "Leave Nelsons glorious flag unsung, And Wellington unwreathed; Their fame with which all Europe rung, For his\u2014 whod best neer breathed ! 
"], "pos": 101, "end_pos": 111, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q545"]}, {"mention": "Nile", "sent_idx": 17, "sentence": "Let net Trafalgars chief, who died, A moments thought beguile; Nor laud with British heart and pride The Baltic and the Nile! ", "ngram": "Nile", "context": ["To Cossack hate consigned :\u2014 Famine and frozen wastes before, Bones and the wolf behind. ", "Leave Nelsons glorious flag unsung, And Wellington unwreathed; Their fame with which all Europe rung, For his\u2014 whod best neer breathed ! "], "pos": 120, "end_pos": 124, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q3392"]}], "3896074_18": [{"mention": "Europe", "sent_idx": 18, "sentence": "Leave Nelsons glorious flag unsung, And Wellington unwreathed; Their fame with which all Europe rung, For his\u2014 whod best neer breathed ! ", "ngram": "Europe", "context": ["Let net Trafalgars chief, who died, A moments thought beguile; Nor laud with British heart and pride The Baltic and the Nile! ", "And gloze the tyrants guilty mood, And wail his hapless cause: That Sylla\u2014in his thirst for blood ! "], "pos": 89, "end_pos": 95, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q46"]}], "3896074_21": [{"mention": "France", "sent_idx": 21, "sentence": "Mourn him who to the conscript gave HeHrt-broken France a prey; And sill could outraged nature brave With homicidal sway ! ", "ngram": "France", "context": ["That Draco\u2014in his laws ! ", "Yes, wake the throb of sympathyBid maudlin tears reveal How much men grieve for Europe freeHow miss the tyrants steel! 
"], "pos": 49, "end_pos": 55, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q142"]}], "3896074_22": [{"mention": "Europe", "sent_idx": 22, "sentence": "Yes, wake the throb of sympathyBid maudlin tears reveal How much men grieve for Europe freeHow miss the tyrants steel! ", "ngram": "Europe", "context": ["Mourn him who to the conscript gave HeHrt-broken France a prey; And sill could outraged nature brave With homicidal sway ! ", "And make the conscious sea blush gore In shame for Nelsons land ; Make earth, revolting, lift once more Her blood-accusing hand ! "], "pos": 80, "end_pos": 86, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q46"]}], "3691199_2": [{"mention": "St. Martins-place", "sent_idx": 2, "sentence": "The incorporated society for promoting the enlargement, building, and repairing of churches and chapels, held their nineteenth annual general committee on Friday week, at their chambers in St. Martins-place", "ngram": "St. Martins-place", "context": ["CHURCHES AND CHAPELS. ", "; the Archbishop of Canterbury in the chair. "], "pos": 189, "end_pos": 206, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "BUILDING", "gold": "NIL"}], "3691199_13": [{"mention": "Bath", "sent_idx": 13, "sentence": "Contributions had been received from diocesan associations in Bath and Wells, Winchester, Exeter, and Cleveland, which were gratifying proofs of the increasing desire to relieve the spiritual wants of the people of the Church of England. ", "ngram": "Bath", "context": ["The grants of last year were deemed particularly important, as while the society had contributed towards the increase of accommodation in many of the present churches they had contributed to the erection of forty-one additional churches and chapels, and the rebuilding of sixteen others. ", "AmjtJgst their contributions was the munificent sum of 18,000 from Mr. 
"], "pos": 62, "end_pos": 66, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q22889"]}, {"mention": "Wells", "sent_idx": 13, "sentence": "Contributions had been received from diocesan associations in Bath and Wells, Winchester, Exeter, and Cleveland, which were gratifying proofs of the increasing desire to relieve the spiritual wants of the people of the Church of England. ", "ngram": "Wells", "context": ["The grants of last year were deemed particularly important, as while the society had contributed towards the increase of accommodation in many of the present churches they had contributed to the erection of forty-one additional churches and chapels, and the rebuilding of sixteen others. ", "AmjtJgst their contributions was the munificent sum of 18,000 from Mr. "], "pos": 71, "end_pos": 76, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q212283"]}, {"mention": "Winchester", "sent_idx": 13, "sentence": "Contributions had been received from diocesan associations in Bath and Wells, Winchester, Exeter, and Cleveland, which were gratifying proofs of the increasing desire to relieve the spiritual wants of the people of the Church of England. ", "ngram": "Winchester", "context": ["The grants of last year were deemed particularly important, as while the society had contributed towards the increase of accommodation in many of the present churches they had contributed to the erection of forty-one additional churches and chapels, and the rebuilding of sixteen others. ", "AmjtJgst their contributions was the munificent sum of 18,000 from Mr. 
"], "pos": 78, "end_pos": 88, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q172157"]}, {"mention": "Exeter", "sent_idx": 13, "sentence": "Contributions had been received from diocesan associations in Bath and Wells, Winchester, Exeter, and Cleveland, which were gratifying proofs of the increasing desire to relieve the spiritual wants of the people of the Church of England. ", "ngram": "Exeter", "context": ["The grants of last year were deemed particularly important, as while the society had contributed towards the increase of accommodation in many of the present churches they had contributed to the erection of forty-one additional churches and chapels, and the rebuilding of sixteen others. ", "AmjtJgst their contributions was the munificent sum of 18,000 from Mr. "], "pos": 90, "end_pos": 96, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q134672"]}, {"mention": "Cleveland", "sent_idx": 13, "sentence": "Contributions had been received from diocesan associations in Bath and Wells, Winchester, Exeter, and Cleveland, which were gratifying proofs of the increasing desire to relieve the spiritual wants of the people of the Church of England. ", "ngram": "Cleveland", "context": ["The grants of last year were deemed particularly important, as while the society had contributed towards the increase of accommodation in many of the present churches they had contributed to the erection of forty-one additional churches and chapels, and the rebuilding of sixteen others. ", "AmjtJgst their contributions was the munificent sum of 18,000 from Mr. 
"], "pos": 102, "end_pos": 111, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q24651706"]}, {"mention": "England", "sent_idx": 13, "sentence": "Contributions had been received from diocesan associations in Bath and Wells, Winchester, Exeter, and Cleveland, which were gratifying proofs of the increasing desire to relieve the spiritual wants of the people of the Church of England. ", "ngram": "England", "context": ["The grants of last year were deemed particularly important, as while the society had contributed towards the increase of accommodation in many of the present churches they had contributed to the erection of forty-one additional churches and chapels, and the rebuilding of sixteen others. ", "AmjtJgst their contributions was the munificent sum of 18,000 from Mr. "], "pos": 229, "end_pos": 236, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q21"]}], "3691199_15": [{"mention": "Lime-street", "sent_idx": 15, "sentence": "George Davenport, of Lime-street, London. ", "ngram": "Lime-street", "context": ["AmjtJgst their contributions was the munificent sum of 18,000 from Mr. ", "Since their institution in 1818 the society had expended \u00a3220,731, thus insuring 354,925 additional sittings, of which 262,366 were free and unappropriated. "], "pos": 21, "end_pos": 32, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "STREET", "gold": "NIL"}, {"mention": "London", "sent_idx": 15, "sentence": "George Davenport, of Lime-street, London. ", "ngram": "London", "context": ["AmjtJgst their contributions was the munificent sum of 18,000 from Mr. ", "Since their institution in 1818 the society had expended \u00a3220,731, thus insuring 354,925 additional sittings, of which 262,366 were free and unappropriated. 
"], "pos": 34, "end_pos": 40, "place": "Manchester", "place_wqid": "Q18125", "candidates": [], "ner_label": "LOC", "gold": ["Q84"]}], "12275_1": [{"mention": "LYND H UR ST", "sent_idx": 1, "sentence": "LYND H UR ST. ", "ngram": "LYND H UR ST", "context": ["", "CAPITAL RUN WITH THE New FOREST Fox HOUND. "], "pos": 0, "end_pos": 12, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q3182986"]}], "12275_2": [{"mention": "New FOREST", "sent_idx": 2, "sentence": "CAPITAL RUN WITH THE New FOREST Fox HOUND. ", "ngram": "New FOREST", "context": ["LYND H UR ST. ", "\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. "], "pos": 21, "end_pos": 31, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q277755"]}], "12275_3": [{"mention": "New Forest", "sent_idx": 3, "sentence": "\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. ", "ngram": "New Forest", "context": ["CAPITAL RUN WITH THE New FOREST Fox HOUND. ", "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. "], "pos": 35, "end_pos": 45, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q277755"]}, {"mention": "Boldrewood", "sent_idx": 3, "sentence": "\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. 
", "ngram": "Boldrewood", "context": ["CAPITAL RUN WITH THE New FOREST Fox HOUND. ", "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. "], "pos": 64, "end_pos": 74, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q4939103"]}, {"mention": "Ringwood", "sent_idx": 3, "sentence": "\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. ", "ngram": "Ringwood", "context": ["CAPITAL RUN WITH THE New FOREST Fox HOUND. ", "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. "], "pos": 188, "end_pos": 196, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q1248943"]}], "12275_4": [{"mention": "Church Moor", "sent_idx": 4, "sentence": "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", "ngram": "Church Moor", "context": ["\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. ", "He then made a turn back over Buldre. 
wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill"], "pos": 41, "end_pos": 52, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}, {"mention": "Burley", "sent_idx": 4, "sentence": "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", "ngram": "Burley", "context": ["\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. ", "He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill"], "pos": 84, "end_pos": 90, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q3195509"]}, {"mention": "Burley", "sent_idx": 4, "sentence": "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", "ngram": "Burley", "context": ["\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. ", "He then made a turn back over Buldre. 
wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill"], "pos": 101, "end_pos": 107, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q3195509"]}, {"mention": "Beech Beds", "sent_idx": 4, "sentence": "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", "ngram": "Beech Beds", "context": ["\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. ", "He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill"], "pos": 147, "end_pos": 157, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}, {"mention": "Oakley", "sent_idx": 4, "sentence": "%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", "ngram": "Oakley", "context": ["\u2014On Thursday, the 9th instant, the New Forest fox hounds met at Boldrewood, arid drew Holme Hill Inclousure, where the famous bitch pock soon winded a Inc old fox, who stole away over the Ringwood rued to Gillett Inclusure. ", "He then made a turn back over Buldre. 
wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill"], "pos": 175, "end_pos": 181, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}], "12275_5": [{"mention": "Buldre. wood", "sent_idx": 5, "sentence": "He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "ngram": "Buldre. wood", "context": ["%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. "], "pos": 30, "end_pos": 42, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q4939103"]}, {"mention": "Gillett Inclosure", "sent_idx": 5, "sentence": "He then made a turn back over Buldre. 
wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "ngram": "Gillett Inclosure", "context": ["%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. "], "pos": 51, "end_pos": 68, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}, {"mention": "Hulme Hill", "sent_idx": 5, "sentence": "He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "ngram": "Hulme Hill", "context": ["%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. "], "pos": 82, "end_pos": 92, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}, {"mention": "Thrifty Beeches", "sent_idx": 5, "sentence": "He then made a turn back over Buldre. 
wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "ngram": "Thrifty Beeches", "context": ["%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. "], "pos": 97, "end_pos": 112, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}, {"mention": "Over Fields", "sent_idx": 5, "sentence": "He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "ngram": "Over Fields", "context": ["%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. "], "pos": 216, "end_pos": 227, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}, {"mention": "Holme !Jill", "sent_idx": 5, "sentence": "He then made a turn back over Buldre. 
wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "ngram": "Holme !Jill", "context": ["%here he turned to the right and crossed Church Moor, thence at a clipping pace, to Burley ; leaving Burley to the left, he pushed forward through Beech Beds to the earths at Oakley, where, to his aurprse, he found no admission. ", ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. "], "pos": 264, "end_pos": 275, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": "NIL"}], "12275_6": [{"mention": "Emery Down", "sent_idx": 6, "sentence": ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. ", "ngram": "Emery Down", "context": ["He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "RESIGNATION Of Elll TIMSON, MASTEI OF THE New FOREST Fox HOUNDS.\u2014 We regret to announce the resignation of Mr. 
fimson, the esteemed master of the New Forest hourids, who has hunted the country for the last five seasons, to the entire satisfaction of those gentlemen, members of the hunt and others, who have been fertunate enough to witness some of the splendid sport which he has so often afforded during his career as such ; and although he relinquishes the mastership, we earnestly hope that a continuance of good health will enahle Mr. f imson to take is usual prominent place in the bunting field for many years to come. "], "pos": 12, "end_pos": 22, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q5370984"]}, {"mention": "Minesteed Manor", "sent_idx": 6, "sentence": ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. ", "ngram": "Minesteed Manor", "context": ["He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "RESIGNATION Of Elll TIMSON, MASTEI OF THE New FOREST Fox HOUNDS.\u2014 We regret to announce the resignation of Mr. fimson, the esteemed master of the New Forest hourids, who has hunted the country for the last five seasons, to the entire satisfaction of those gentlemen, members of the hunt and others, who have been fertunate enough to witness some of the splendid sport which he has so often afforded during his career as such ; and although he relinquishes the mastership, we earnestly hope that a continuance of good health will enahle Mr. f imson to take is usual prominent place in the bunting field for many years to come. 
"], "pos": 34, "end_pos": 49, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "BUILDING", "gold": "NIL"}, {"mention": "Notherwood", "sent_idx": 6, "sentence": ", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. ", "ngram": "Notherwood", "context": ["He then made a turn back over Buldre. wood Hill to Gillett Inclosure, and through Hulme Hill, to Thrifty Beeches ; here he again turned to the lett, sod ran almost in a straight direction \u2022 distance of five miles to Over Fields ; then took \u2022 backward direction to Holme !Jill", "RESIGNATION Of Elll TIMSON, MASTEI OF THE New FOREST Fox HOUNDS.\u2014 We regret to announce the resignation of Mr. fimson, the esteemed master of the New Forest hourids, who has hunted the country for the last five seasons, to the entire satisfaction of those gentlemen, members of the hunt and others, who have been fertunate enough to witness some of the splendid sport which he has so often afforded during his career as such ; and although he relinquishes the mastership, we earnestly hope that a continuance of good health will enahle Mr. f imson to take is usual prominent place in the bunting field for many years to come. "], "pos": 75, "end_pos": 85, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "BUILDING", "gold": "NIL"}], "12275_7": [{"mention": "New FOREST", "sent_idx": 7, "sentence": "RESIGNATION Of Elll TIMSON, MASTEI OF THE New FOREST Fox HOUNDS.\u2014 We regret to announce the resignation of Mr. 
fimson, the esteemed master of the New Forest hourids, who has hunted the country for the last five seasons, to the entire satisfaction of those gentlemen, members of the hunt and others, who have been fertunate enough to witness some of the splendid sport which he has so often afforded during his career as such ; and although he relinquishes the mastership, we earnestly hope that a continuance of good health will enahle Mr. f imson to take is usual prominent place in the bunting field for many years to come. ", "ngram": "New FOREST", "context": [", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. ", "It should be remembered that Mr. "], "pos": 42, "end_pos": 52, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q277755"]}, {"mention": "New Forest", "sent_idx": 7, "sentence": "RESIGNATION Of Elll TIMSON, MASTEI OF THE New FOREST Fox HOUNDS.\u2014 We regret to announce the resignation of Mr. fimson, the esteemed master of the New Forest hourids, who has hunted the country for the last five seasons, to the entire satisfaction of those gentlemen, members of the hunt and others, who have been fertunate enough to witness some of the splendid sport which he has so often afforded during his career as such ; and although he relinquishes the mastership, we earnestly hope that a continuance of good health will enahle Mr. f imson to take is usual prominent place in the bunting field for many years to come. ", "ngram": "New Forest", "context": [", thence to Emery Down,crowing to Minesteed Manor ; he ther tacked back to Notherwood, and from thence back again to the Manor, where, after a brilliant run (Arnie hour and forty-five minutes, Reynold was compelled to succumb to his pursuers. ", "It should be remembered that Mr. 
"], "pos": 146, "end_pos": 156, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q277755"]}], "12275_14": [{"mention": "POOLE", "sent_idx": 14, "sentence": "POOLE.", "ngram": "POOLE", "context": ["SHIPPING INTELLIGENCr:. ", ""], "pos": 0, "end_pos": 5, "place": "Poole", "place_wqid": "Q203349", "candidates": [], "ner_label": "LOC", "gold": ["Q203349"]}]} \ No newline at end of file diff --git a/tests/sample_files/resources/deezymatch/data/w2v_ocr_pairs_test.txt b/tests/sample_files/resources/deezymatch/data/w2v_ocr_pairs_test.txt new file mode 100644 index 00000000..7278aedf --- /dev/null +++ b/tests/sample_files/resources/deezymatch/data/w2v_ocr_pairs_test.txt @@ -0,0 +1,200 @@ +russia france FALSE +woman child FALSE +determined vigourously FALSE +determined reconstitute FALSE +determined resolves FALSE +determined proposes FALSE +determined resolute FALSE +labour earnings FALSE +labour industry FALSE +labour cheapen FALSE +labour drudgery FALSE +labour idleness FALSE +labour disables FALSE +labour sampling FALSE +superior quality FALSE +superior finest FALSE +superior excellence FALSE +superior duality FALSE +superior excellent FALSE +superior champagnes FALSE +superior refractive FALSE +superior pliability FALSE +superior keying FALSE +superior overlays FALSE +superior cheapness FALSE +share profits FALSE +share profit FALSE +share deposit FALSE +share outbid FALSE +share broking FALSE +proceed embark FALSE +proceed depart FALSE +proceed disembark FALSE +proceed assemble FALSE +proceed arrive FALSE +proceed accompany FALSE +proceed resume FALSE +dinner banquet FALSE +language expression FALSE +language epithets FALSE +language soberness FALSE +language fluently FALSE +language epithet FALSE +language couched FALSE +language dialect FALSE +major lieut FALSE +refused alleging FALSE +refused compel FALSE +refused allow FALSE +refused unable FALSE +refused unwilling FALSE +refused traynor FALSE +refused demanding FALSE 
+refused comply FALSE +refused ruhman FALSE +judge denman FALSE +judge empanel FALSE +judge bolland FALSE +introduced referral FALSE +introduced codification FALSE +introduced analagous FALSE +introduced validating FALSE +introduced eulogizes FALSE +introduced adaption FALSE +facts truth FALSE +facts lucidly FALSE +facts deduce FALSE +direct ratifies FALSE +direct positive FALSE +direct tchad FALSE +direct override FALSE +direct falters FALSE +popular democracy FALSE +increased sixfold FALSE +increased diminish FALSE +increased tenfold FALSE +increased augment FALSE +increased lessening FALSE +increased fivefold FALSE +increased augmenting FALSE +increased subtract FALSE +increased fourfold FALSE +opposed hostile FALSE +opposed crotchety FALSE +opposed reformist FALSE +opposed variance FALSE +opposed support FALSE +opposed resist FALSE +opposed inimical FALSE +opposed favour FALSE +opposed layout FALSE +guilty wilful FALSE +guilty offence FALSE +guilty pleaded FALSE +guilty accused FALSE +guilty heinous FALSE +guilty abetting FALSE +guilty murder FALSE +guilty crime FALSE +guilty offense FALSE +dakotah dakotah TRUE +dakotah dacotah TRUE +dakotah dakotah TRUE +dakotah dacotab TRUE +dakotah dakotah TRUE +charioteer charioteer TRUE +gutman gutman TRUE +loquitur loquitur TRUE +quesada quesada TRUE +deaconry deaconry TRUE +incontinent incontinent TRUE +campeche campeche TRUE +propter propter TRUE +antia antia TRUE +flavigny flavigny TRUE +hightail hightail TRUE +sucrose sucrose TRUE +amsden amsden TRUE +amsden rameden TRUE +amsden amsden TRUE +boozy boozy TRUE +grayrigg grayrigg TRUE +harmsworth harmsworth TRUE +gottfried gottfried TRUE +avalos avalos TRUE +stainland stainland TRUE +taton taton TRUE +vinos vinos TRUE +giselle giselle TRUE +emphases emphases TRUE +emphases emplanes TRUE +emphases emphases TRUE +flatiron flatiron TRUE +daggs daggs TRUE +unconventional unconventional TRUE +unconventional unpoetical TRUE +unconventional unconventional TRUE +precociously 
precociously TRUE +mayville mayville TRUE +pralines pralines TRUE +birtles birtles TRUE +potchefstroom potchefstroom TRUE +saporta saporta TRUE +ostracise ostracise TRUE +ciento ciento TRUE +effenberg effenberg TRUE +magnusson magnusson TRUE +chipstead chipstead TRUE +meols meols TRUE +skunks skunks TRUE +rayners rayners TRUE +suleiman suleiman TRUE +kocher kocher TRUE +mandrell mandrell TRUE +purveyed purveyed TRUE +roxana roxana TRUE +lookup lookup TRUE +laborie laborie TRUE +laborie labedoyere TRUE +laborie laborie TRUE +materiality materiality TRUE +valter valter TRUE +valter waltor TRUE +valter valter TRUE +embolism embolism TRUE +mazzetti mazzetti TRUE +digesters digesters TRUE +hindes hindes TRUE +stopgap stopgap TRUE +classon classon TRUE +meiji meiji TRUE +plett plett TRUE +skylarks skylarks TRUE +zadok zadok TRUE +nicols nicols TRUE +nicols nickolls TRUE +nicols nicols TRUE +burdock burdock TRUE +combahee combahee TRUE +managua managua TRUE +pepped pepped TRUE +cantar cantar TRUE +adorably adorably TRUE +mencius mencius TRUE +ogaden ogaden TRUE +timia timia TRUE +uncorrupt uncorrupt TRUE +garron garron TRUE +paribas paribas TRUE +dinkins dinkins TRUE +kindl kindl TRUE +bartholemew bartholemew TRUE +bartholemew bartholomews TRUE +bartholemew bartholemew TRUE +lyttle lyttle TRUE +lyttle lyttel TRUE +lyttle lyttle TRUE +visioned visioned TRUE +orthopaedic orthopaedic TRUE +dunkerton dunkerton TRUE \ No newline at end of file diff --git a/tests/sample_files/resources/deezymatch/models/w2v_ocr/input_dfm.yaml b/tests/sample_files/resources/deezymatch/models/w2v_ocr/input_dfm.yaml new file mode 100644 index 00000000..b777d226 --- /dev/null +++ b/tests/sample_files/resources/deezymatch/models/w2v_ocr/input_dfm.yaml @@ -0,0 +1,111 @@ +general: + use_gpu: False # only if available + # specify CUDA device, these are 0-indexed, e.g., + # cuda:0, cuda:1 or others. 
"cuda" is the default CUDA device + gpu_device: "cuda" + # Parent dir to save trained models + models_dir: "../resources/deezymatch/models" + +preprocessing: + # normalizing text to the ASCII encoding standard + uni2ascii: True + lowercase: True + # removing both the leading and the trailing empty characters + strip: True + only_latin_letters: False + # Accepted proportion of characters in a string that are not present in our vocabulary, i.e., + # String is accepted if: + # (number of its characters found in the vocabulary)/(total number its characters) >= missing_char_threshold + missing_char_threshold: 0.5 + # read a list of characters and add to the vocabulary + read_list_chars: "../resources/deezymatch/inputs/characters_v001.vocab" + +# --- RNN/GRU/LSTM architecture/misc info +gru_lstm: + main_architecture: "gru" # rnn, gru, lstm + mode: # Tokenization mode + # choices: "char", "ngram", "word" + # for example: tokenize: ["char", "ngram", "word"] or ["char", "word"] + tokenize: ["char"] + # ONLY if "char" or "ngram" are slected in tokenize, the following arg will be used: + # Strings in the inputs will be prefix + string + suffix: + # NOTE: please use only STRINGS in prefix_suffix list, + # if you want only prefix or suffix, enter the other as an empty string "" + # e.g., ["|", ""] would add | as prefix and no suffix + prefix_suffix: ["", ""] + # ONLY if "ngram" is selected in tokenize, the following args will be used: + min_gram: 4 + max_gram: 5 + # ONLY if "word" is selected in tokenize, the following arg will be used: + # Characters according to which the string will split into tokens: + # Accepted inputs are either a string of delimiters (e.g. ", -!?:()" or + # "default", in which case delimiters will be the white space and any + # punctuation mark): + token_sep: "default" + bidirectional: True # if True, becomes a bidirectional RNN/GRU/LSTM + # num_layers + # number of recurrent layers. 
e.g., setting num_layers=2 means stacking two + # RNN/GRU/LSTMs together to form a stacked RNN/GRU/LSTM, + # with the second RNN/GRU/LSTM taking in outputs of the first RNN/GRU/LSTM and computing the final results. + num_layers: 2 + # number of dimensions of the first fully connected network + fc1_out_dim: 120 + # pooling_mode: + # hstates_layers_simple, hstates_layers, hstates + # hstates_subtract, hstates_l2_distance, hstates_cosine + # average, max + # attention + pooling_mode: 'average' + # rnn_dropout: + # if non-zero, introduces a Dropout layer on the outputs of each RNN/LSTM/GRU layer except the last layer, + # with dropout probability equal to rnn_dropout. + rnn_dropout: 0.1 + # fully-connected layers dropout depends on the number of fc layers (currently there are two) + fc_dropout: [0.1, 0.1] + # attention layer dropout depends on the number of attention layers (currently there are two) + att_dropout: [0.1, 0.1] + # Add bias to all learnable parameters + bias: True + + rnn_hidden_dim: 60 + max_seq_len: 120 + embedding_dim: 60 + output_dim: 2 + + learning_rate: 0.005 + optimizer: adam + epochs: 5 + batch_size: 32 + # shuffle when creating DataLoader + dl_shuffle: True + random_seed: 123 + # Early stopping: + # Number of epochs with no improvement after which training will be stopped and + # the model with the least validation loss will be saved + # If 0 or negative, early stopping will be deactivated + early_stopping_patience: 1 + + # if -1 or 1, perform the validation step in every epoch; + # if 0, no validation will be done + # otherwise, specify the interval (integer) + validation: 1 + # split dataset + train_proportion: 0.85 + val_proportion: 0.15 + test_proportion: 0 + + # False or path to a directory to create tensor-board + #create_tensor_board: "./tb_gru_test" + create_tensor_board: False + + # Layers to freeze during fine-tuning + layers_to_freeze: ["emb", "rnn_1", "attn"] + +inference: + # Output predictions and save the results in 
output_preds_file + output_preds: True + # either a path or "default" + # "default" saves the prediction output inside the model directory + output_preds_file: "default" + # change it to true to have Mean Average Precision as an eval metric. Note that this would have an impact on computational time + eval_map_metric: False diff --git a/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.model b/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.model new file mode 100644 index 00000000..d9071a77 Binary files /dev/null and b/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.model differ diff --git a/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.model_state_dict b/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.model_state_dict new file mode 100644 index 00000000..574e0b14 Binary files /dev/null and b/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.model_state_dict differ diff --git a/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.vocab b/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.vocab new file mode 100644 index 00000000..68721ef8 Binary files /dev/null and b/tests/sample_files/resources/deezymatch/models/w2v_ocr/w2v_ocr.vocab differ diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/.DS_Store b/tests/sample_files/resources/news_datasets/topRes19th_v2/.DS_Store new file mode 100644 index 00000000..afa651e9 Binary files /dev/null and b/tests/sample_files/resources/news_datasets/topRes19th_v2/.DS_Store differ diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/README.md b/tests/sample_files/resources/news_datasets/topRes19th_v2/README.md new file mode 100644 index 00000000..00274f6e --- /dev/null +++ b/tests/sample_files/resources/news_datasets/topRes19th_v2/README.md @@ -0,0 +1,121 @@ +# A Dataset for Toponym Resolution in Nineteenth-Century English Newspapers + +## Description + +We present a new dataset (version 
2) for the task of toponym resolution in digitised historical newspapers in English. It consists of 455 annotated articles from newspapers based in four different locations in England (Manchester, Ashton-under-Lyne, Poole and Dorchester), published between 1780 and 1870. The articles have been manually annotated with mentions of places, which are linked---whenever possible---to their corresponding entry on Wikipedia. The dataset is published on the British Library shared research repository, and is especially of interest to researchers working on improving semantic access to historical newspaper content. + +We share the 455 annotated files (one file per article) in the WebAnno TSV file format version 3.2, a CoNLL-based file format. The files are split into a train and test set. For each split, we additionally provide a TSV file with metadata at the article level. We also provide the original annotation guidelines. + +## Directory structure + +```bash= +topRes19th_v2/ +├── README.md +├── train/ +│ ├── annotated_tsv/ +│ │ ├── 1218_Poole1860.tsv +│ │ ├── ... +│ │ └── 10877685_Dorchester1830.tsv +│ └── metadata.tsv +├── test/ +│ ├── annotated_tsv/ +│ │ ├── 9144_Poole1860.tsv +│ │ ├── ... +│ │ └── 10860796_Dorchester1860.tsv +│ └── metadata.tsv +└── original_guidelines.md +``` + +## Data description + +### `[split]/annotated_tsv/*.tsv` + +Each WebAnno TSV file in `annotated_tsv/` corresponds to an article. The file names (e.g. `1218_Poole1860.tsv`) consist of three elements: an internal Living with Machines identifier of the article (`1218`), the place of publication (`Poole`) and the decade of publication (`1860`). The WebAnno TSV format is a CoNLL-based file format, which has a header, is sentence-separated (by a blank line), and lists one token per line, with the different layers of annotations separated with tabs. 
See an example: +``` +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Customentity|identifiier|value + + +#Text=THE POOLE AND SOUTH-WESTERN HERALD, THURSDAY, OCTOBER 20, 1864. +1-1 0-3 THE _ _ +1-2 4-9 POOLE _ _ +1-3 10-13 AND _ _ +1-4 14-27 SOUTH-WESTERN _ _ +1-5 28-34 HERALD _ _ +1-6 34-35 , _ _ +1-7 36-44 THURSDAY _ _ +1-8 44-45 , _ _ +1-9 46-53 OCTOBER _ _ +1-10 54-56 20 _ _ +1-11 56-57 , _ _ +1-12 58-62 1864 _ _ +1-13 62-63 . _ _ + +#Text=POOLE TOWN COUNCIL. +2-1 65-70 POOLE https://en.wikipedia.org/wiki/Poole LOC +2-2 71-75 TOWN _ _ +2-3 76-83 COUNCIL _ _ +2-4 83-84 . _ _ +``` + +This example has two full sentences, preceded by `#Text=`, and split with one token per line. Now we look at one line in more detail: +``` +2-1 65-70 POOLE https://en.wikipedia.org/wiki/Poole LOC +``` +The tab-separated elements are: +* `2-1`: the indices of the sentence in the document and the token in the sentence. +* `65-70`: start and end character positions of the token in the document. +* `POOLE`: the token. +* `https://en.wikipedia.org/wiki/Poole`: the Wikipedia URL (if linked). +* `LOC`: the toponym class. + +Toponyms are annotated with the following classes: +* `BUILDING`: names of buildings, such as the 'British Museum'. +* `STREET`: streets, roads, and other odonyms, such as 'Great Russell St'. +* `LOC`: any other real world places regardless of type or scale, such as 'Bloomsbury', 'London' or 'Great Britain'. +* `ALIEN`: extraterrestrial locations, such as 'Venus'. +* `FICTION`: fictional or mythical places, such as 'Hell'. +* `OTHER`: other types of entities with coordinates, such as events, like the 'Battle of Waterloo'. + + +### `metadata.tsv` + +The `metadata.tsv` file links each annotated tsv file to its metadata. It consists of a header and one row per article, with the following fields: +* `fname`: name of the annotated file, without the extension (e.g. `1218_Poole1860`) +* `word_count`: number of words in the article. 
+* `ocr_quality_mean`: OCR quality mean, calculated as per-word OCR confidence scores as reported in the source metadata. +* `ocr_quality_sd`: OCR quality standard deviation. +* `issue_date`: date of publication of the article. +* `publication_code`: publication code (internal). +* `publication_title`: name of the newspaper publication. +* `decade`: decade of publication of the article. +* `place_publication`: place of publication. +* `annotation_batch`: each article is assigned to one annotation batch. All annotation batches are similarly distributed in terms of place and decade of publication. + +## License + +The dataset is released under open license CC-BY-NC-SA, available at https://creativecommons.org/licenses/by-nc-sa/4.0/. + +## Copyright notice + +Newspaper data has been provided by Findmypast Limited from the British Newspaper Archive, a partnership between the British Library and Findmypast (https://www.britishnewspaperarchive.co.uk/). + +## Funding statement + +This work was supported by Living with Machines (AHRC grant AH/S01179X/1) and The Alan Turing Institute (EPSRC grant EP/N510129/1). + +## Dataset creators + +Mariona Coll Ardanuy (conceptualization, data curation, formal analysis, project management, writing), David Beavan (resources, software, writing), Kaspar Beelen (resources, data curation, writing), Kasra Hosseini (resources, software), Jon Lawrence (conceptualization, data curation, project management), Katherine McDonough (conceptualization, data curation, writing), Federico Nanni (validation, writing), Daniel van Strien (resources, software), Daniel C.S. Wilson (conceptualization, data curation, writing). + +## Version changes + +**Version 2:** + +* Annotations (`annotated_tsv/*.tsv`): + - The toponyms that were annotated as "LOCWiki" are now annotated as "LOC". + - "UNKNOWN" has been removed from all data fields; instances of this class have been classified into the other classes (mostly "LOC"). 
+* Metadata (`metadata.tsv`): + - Column "publication_location" removed. + - Column "annotation_decade" renamed to "decade". + - Column "annotation_location" renamed to "place_publication". \ No newline at end of file diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/original_guidelines.md b/tests/sample_files/resources/news_datasets/topRes19th_v2/original_guidelines.md new file mode 100644 index 00000000..17534aee --- /dev/null +++ b/tests/sample_files/resources/news_datasets/topRes19th_v2/original_guidelines.md @@ -0,0 +1,47 @@ +# Annotation guidelines + +**Note:** These are the original annotation guidelines. The annotations in the current version of the dataset have since been refined; the main changes are described in the accompanying README file (under `Version changes`). + +## Toponym resolution annotations + +This document contains the annotation guidelines for marking up and georeferencing locations mentioned in historical newspaper articles. The task of the annotator is to recognize each location mentioned in the text and map it to the URL of the Wikipedia article that refers to it. + +### The task + +Place names are often highly ambiguous. There are, for instance, more than 20 different places named Paris all over the world, as well as different instances of records relating to Paris, France. The task of toponym resolution can be similar to word sense disambiguation: in most scenarios the most commonly used sense (or place) is the correct sense (or place). However, our data is mostly composed of historical local and regional newspapers, and the world view that is represented in these texts is skewed towards the knowledge expected of their intended average, regional reader. It is therefore important that annotators take into account the date and place(s) of newspaper publication/circulation during the annotation process. 
+ +### What to annotate + +Location: any named entity of a location that is static and can be defined according to a pair of static world coordinates (including metonyms, as in 'France signed the deal.'). If there is an OCR error, we will annotate the location if we can recognise it because of context clues in the word itself or in the surrounding text (for example, we would link "iHancfjrcter" to https://en.wikipedia.org/wiki/Manchester). We will not perform any additional post-correction of the OCRed text. + +### How to annotate + +The annotator should map each location found in the text with the URL of the Wikipedia article that refers to it. + +To do so: +* Make sure you have selected the Layer `Custom entity` (if you don't see it, make sure you are in a 'Toponym resolution' project). +* Select with the mouse the span of text you want to annotate (e.g. 'West Laviogton') and select `LOCWiki` from the dropdown menu. + +In this task, the custom entity `LOCWiki` refers to a real world place regardless of scale (region, city, neighborhood) with the exception of the additional, separate categories listed below: + * `BUILDING`: Names of buildings (e.g. schools, hospitals, factories, palaces, etc.). Optional link to Wikipedia article if it exists. + * `STREET`: Streets, squares, etc. Optional link to Wikipedia article if it exists. + * `ALIEN`: Extraterrestrial locations (e.g. the moon). Optional link to Wikipedia article if it exists. + * `OTHER`: Others, as in famous trees (https://en.wikipedia.org/wiki/Lone_Cypress) or battlefields (https://en.wikipedia.org/wiki/Battle_of_Waterloo). Optional link to Wikipedia article if it exists. + * `UNKNOWN`: If the location has no Wikipedia entry OR if you cannot determine what place it is, but are confident that it is a place. No link to Wikipedia. + * `FICTION`: If it is a fictional/mythical place (e.g. Lilliput). Optional link to Wikipedia article if it exists. 
+ +* How to annotate with Wikipedia links: + * Go to Wikipedia (English version). + * Find the correct article corresponding to the place mentioned in the text (e.g. `https://en.wikipedia.org/wiki/West_Lavington,_Wiltshire`). + * Copy the full URL and paste it into the identifier box. +* To delete an annotation, click on it and click on `Delete` in the Annotation box. + +The article title will give you an indication of the place of publication of the article, to help you disambiguate the toponyms in the article (e.g. `10713959_Dorchester1820.txt` is an article published in Dorchester, Dorset, in the 1820s---the date refers to the decade, not the year, of publication). + +Some annotation considerations: +* Choose 'historical county' record over 'ceremonial county' for county place names. +* Do not include places that are not referred to by proper names (e.g. 'the park'). +* Always favour a geo-coded link even if it is less perfect. + > For example: Bengal---a province of British Colonial India---has a wiki page but it is not geo-coded because it is an historic term for places now in India (West Bengal) and Bangladesh. The latter has been linked since it represents the bulk of British Bengal and is geo-coded. +* Do not geocode the place if it's part of a person's title ("the Earl of Warwick"). +* Company stocks and shares named after places - e.g. Westminster Bank, Devon Great Consols (mine) - should NOT be linked, as they are commercial credit notes linked to a trading entity. They are not places as such. 
\ No newline at end of file diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/test/.DS_Store b/tests/sample_files/resources/news_datasets/topRes19th_v2/test/.DS_Store new file mode 100644 index 00000000..bfeec556 Binary files /dev/null and b/tests/sample_files/resources/news_datasets/topRes19th_v2/test/.DS_Store differ diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/test/annotated_tsv/9144_Poole1860.tsv b/tests/sample_files/resources/news_datasets/topRes19th_v2/test/annotated_tsv/9144_Poole1860.tsv new file mode 100644 index 00000000..7efc6e50 --- /dev/null +++ b/tests/sample_files/resources/news_datasets/topRes19th_v2/test/annotated_tsv/9144_Poole1860.tsv @@ -0,0 +1,430 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Customentity|identifiier|value + + +#Text=WAREHAM TURNPIKE, +#Text=NOTICE is hereby given, —That the TOLLS arising at the undermentioned TOLL GATES will be LET BY AUCTION, to the best Bidden, at the Towx HALL, in WADEHAY, on TUESDAY, the 15th day of MAY next, between the hours of twelve and three, in the manner directed by an Act ionised in the third year of his Majesty King George the Fourth" for regulating Turnpike Roads," which Tolls produced last year the sum of £892 15s. 6d. above the expense of collecting them, and will be put up in the following Lots, at such sums as the Trustees present shall think fit, and to be Let from noon on the Ist July next, for such term as shall at the said Meeting be determined +#Text=Lot. 
+1-1 0-7 WAREHAM https://en.wikipedia.org/wiki/Wareham,\_Dorset LOC +1-2 8-16 TURNPIKE _ _ +1-3 16-17 , _ _ +1-4 19-25 NOTICE _ _ +1-5 26-28 is _ _ +1-6 29-35 hereby _ _ +1-7 36-41 given _ _ +1-8 41-42 , _ _ +1-9 43-44 — _ _ +1-10 44-48 That _ _ +1-11 49-52 the _ _ +1-12 53-58 TOLLS _ _ +1-13 59-66 arising _ _ +1-14 67-69 at _ _ +1-15 70-73 the _ _ +1-16 74-88 undermentioned _ _ +1-17 89-93 TOLL _ _ +1-18 94-99 GATES _ _ +1-19 100-104 will _ _ +1-20 105-107 be _ _ +1-21 108-111 LET _ _ +1-22 112-114 BY _ _ +1-23 115-122 AUCTION _ _ +1-24 122-123 , _ _ +1-25 124-126 to _ _ +1-26 127-130 the _ _ +1-27 131-135 best _ _ +1-28 136-142 Bidden _ _ +1-29 142-143 , _ _ +1-30 144-146 at _ _ +1-31 147-150 the _ _ +1-32 151-155 Towx *[1] BUILDING[1] +1-33 156-160 HALL *[1] BUILDING[1] +1-34 160-161 , _ _ +1-35 162-164 in _ _ +1-36 165-172 WADEHAY https://en.wikipedia.org/wiki/Wareham,\_Dorset LOC +1-37 172-173 , _ _ +1-38 174-176 on _ _ +1-39 177-184 TUESDAY _ _ +1-40 184-185 , _ _ +1-41 186-189 the _ _ +1-42 190-194 15th _ _ +1-43 195-198 day _ _ +1-44 199-201 of _ _ +1-45 202-205 MAY _ _ +1-46 206-210 next _ _ +1-47 210-211 , _ _ +1-48 212-219 between _ _ +1-49 220-223 the _ _ +1-50 224-229 hours _ _ +1-51 230-232 of _ _ +1-52 233-239 twelve _ _ +1-53 240-243 and _ _ +1-54 244-249 three _ _ +1-55 249-250 , _ _ +1-56 251-253 in _ _ +1-57 254-257 the _ _ +1-58 258-264 manner _ _ +1-59 265-273 directed _ _ +1-60 274-276 by _ _ +1-61 277-279 an _ _ +1-62 280-283 Act _ _ +1-63 284-291 ionised _ _ +1-64 292-294 in _ _ +1-65 295-298 the _ _ +1-66 299-304 third _ _ +1-67 305-309 year _ _ +1-68 310-312 of _ _ +1-69 313-316 his _ _ +1-70 317-324 Majesty _ _ +1-71 325-329 King _ _ +1-72 330-336 George _ _ +1-73 337-340 the _ _ +1-74 341-347 Fourth _ _ +1-75 347-348 " _ _ +1-76 349-352 for _ _ +1-77 353-363 regulating _ _ +1-78 364-372 Turnpike _ _ +1-79 373-378 Roads _ _ +1-80 378-379 , _ _ +1-81 379-380 " _ _ +1-82 381-386 which _ _ +1-83 387-392 Tolls _ _ +1-84 393-401 produced _ _ 
+1-85 402-406 last _ _ +1-86 407-411 year _ _ +1-87 412-415 the _ _ +1-88 416-419 sum _ _ +1-89 420-422 of _ _ +1-90 423-427 £892 _ _ +1-91 428-431 15s _ _ +1-92 431-432 . _ _ +1-93 433-435 6d _ _ +1-94 435-436 . _ _ +1-95 437-442 above _ _ +1-96 443-446 the _ _ +1-97 447-454 expense _ _ +1-98 455-457 of _ _ +1-99 458-468 collecting _ _ +1-100 469-473 them _ _ +1-101 473-474 , _ _ +1-102 475-478 and _ _ +1-103 479-483 will _ _ +1-104 484-486 be _ _ +1-105 487-490 put _ _ +1-106 491-493 up _ _ +1-107 494-496 in _ _ +1-108 497-500 the _ _ +1-109 501-510 following _ _ +1-110 511-515 Lots _ _ +1-111 515-516 , _ _ +1-112 517-519 at _ _ +1-113 520-524 such _ _ +1-114 525-529 sums _ _ +1-115 530-532 as _ _ +1-116 533-536 the _ _ +1-117 537-545 Trustees _ _ +1-118 546-553 present _ _ +1-119 554-559 shall _ _ +1-120 560-565 think _ _ +1-121 566-569 fit _ _ +1-122 569-570 , _ _ +1-123 571-574 and _ _ +1-124 575-577 to _ _ +1-125 578-580 be _ _ +1-126 581-584 Let _ _ +1-127 585-589 from _ _ +1-128 590-594 noon _ _ +1-129 595-597 on _ _ +1-130 598-601 the _ _ +1-131 602-605 Ist _ _ +1-132 606-610 July _ _ +1-133 611-615 next _ _ +1-134 615-616 , _ _ +1-135 617-620 for _ _ +1-136 621-625 such _ _ +1-137 626-630 term _ _ +1-138 631-633 as _ _ +1-139 634-639 shall _ _ +1-140 640-642 at _ _ +1-141 643-646 the _ _ +1-142 647-651 said _ _ +1-143 652-659 Meeting _ _ +1-144 660-662 be _ _ +1-145 663-673 determined _ _ +1-146 675-678 Lot _ _ +1-147 678-679 . _ _ + +#Text=I. +2-1 680-681 I _ _ +2-2 681-682 . _ _ + +#Text=—Stoborough and Side Gates, 2. +3-1 682-683 — _ _ +3-2 683-693 Stoborough https://en.wikipedia.org/wiki/Stoborough LOC +3-3 694-697 and _ _ +3-4 698-702 Side _ _ +3-5 703-708 Gates _ _ +3-6 708-709 , _ _ +3-7 710-711 2 _ _ +3-8 711-712 . _ _ + +#Text=—Heston and Ulwell Gates, S. 
+4-1 712-713 — _ _ +4-2 713-719 Heston https://en.wikipedia.org/wiki/Herston,\_Dorset LOC +4-3 720-723 and _ _ +4-4 724-730 Ulwell * LOC +4-5 731-736 Gates _ _ +4-6 736-737 , _ _ +4-7 738-739 S _ _ +4-8 739-740 . _ _ + +#Text=—Westport Gate, 4. +5-1 740-741 — _ _ +5-2 741-749 Westport * LOC +5-3 750-754 Gate _ _ +5-4 754-755 , _ _ +5-5 756-757 4 _ _ +5-6 757-758 . _ _ + +#Text=—Wortbport Gates and Side Gate. +6-1 758-759 — _ _ +6-2 759-768 Wortbport * LOC +6-3 769-774 Gates _ _ +6-4 775-778 and _ _ +6-5 779-783 Side _ _ +6-6 784-788 Gate _ _ +6-7 788-789 . _ _ + +#Text=Whoever happen+ to be the best bidder must at the same time pay one month in advance of the rent at which such Tolls may be let, and give security, with sufficient sureties, to the satisfaction of the Trustees, for payment of the rest of the money monthly. +7-1 790-797 Whoever _ _ +7-2 798-804 happen _ _ +7-3 804-805 + _ _ +7-4 806-808 to _ _ +7-5 809-811 be _ _ +7-6 812-815 the _ _ +7-7 816-820 best _ _ +7-8 821-827 bidder _ _ +7-9 828-832 must _ _ +7-10 833-835 at _ _ +7-11 836-839 the _ _ +7-12 840-844 same _ _ +7-13 845-849 time _ _ +7-14 850-853 pay _ _ +7-15 854-857 one _ _ +7-16 858-863 month _ _ +7-17 864-866 in _ _ +7-18 867-874 advance _ _ +7-19 875-877 of _ _ +7-20 878-881 the _ _ +7-21 882-886 rent _ _ +7-22 887-889 at _ _ +7-23 890-895 which _ _ +7-24 896-900 such _ _ +7-25 901-906 Tolls _ _ +7-26 907-910 may _ _ +7-27 911-913 be _ _ +7-28 914-917 let _ _ +7-29 917-918 , _ _ +7-30 919-922 and _ _ +7-31 923-927 give _ _ +7-32 928-936 security _ _ +7-33 936-937 , _ _ +7-34 938-942 with _ _ +7-35 943-953 sufficient _ _ +7-36 954-962 sureties _ _ +7-37 962-963 , _ _ +7-38 964-966 to _ _ +7-39 967-970 the _ _ +7-40 971-983 satisfaction _ _ +7-41 984-986 of _ _ +7-42 987-990 the _ _ +7-43 991-999 Trustees _ _ +7-44 999-1000 , _ _ +7-45 1001-1004 for _ _ +7-46 1005-1012 payment _ _ +7-47 1013-1015 of _ _ +7-48 1016-1019 the _ _ +7-49 1020-1024 rest _ _ +7-50 1025-1027 of _ _ +7-51 1028-1031 the 
_ _ +7-52 1032-1037 money _ _ +7-53 1038-1045 monthly _ _ +7-54 1045-1046 . _ _ + +#Text=Persons intending to bid, and whose sureties shall not be present at the time of Letting must come prepared with a written consent from two persons to become such sureties, and the Trustee+ particularly desire the bidders to take notice that such sureties must be strictly responsible to the full extent of the rent. +8-1 1047-1054 Persons _ _ +8-2 1055-1064 intending _ _ +8-3 1065-1067 to _ _ +8-4 1068-1071 bid _ _ +8-5 1071-1072 , _ _ +8-6 1073-1076 and _ _ +8-7 1077-1082 whose _ _ +8-8 1083-1091 sureties _ _ +8-9 1092-1097 shall _ _ +8-10 1098-1101 not _ _ +8-11 1102-1104 be _ _ +8-12 1105-1112 present _ _ +8-13 1113-1115 at _ _ +8-14 1116-1119 the _ _ +8-15 1120-1124 time _ _ +8-16 1125-1127 of _ _ +8-17 1128-1135 Letting _ _ +8-18 1136-1140 must _ _ +8-19 1141-1145 come _ _ +8-20 1146-1154 prepared _ _ +8-21 1155-1159 with _ _ +8-22 1160-1161 a _ _ +8-23 1162-1169 written _ _ +8-24 1170-1177 consent _ _ +8-25 1178-1182 from _ _ +8-26 1183-1186 two _ _ +8-27 1187-1194 persons _ _ +8-28 1195-1197 to _ _ +8-29 1198-1204 become _ _ +8-30 1205-1209 such _ _ +8-31 1210-1218 sureties _ _ +8-32 1218-1219 , _ _ +8-33 1220-1223 and _ _ +8-34 1224-1227 the _ _ +8-35 1228-1235 Trustee _ _ +8-36 1235-1236 + _ _ +8-37 1237-1249 particularly _ _ +8-38 1250-1256 desire _ _ +8-39 1257-1260 the _ _ +8-40 1261-1268 bidders _ _ +8-41 1269-1271 to _ _ +8-42 1272-1276 take _ _ +8-43 1277-1283 notice _ _ +8-44 1284-1288 that _ _ +8-45 1289-1293 such _ _ +8-46 1294-1302 sureties _ _ +8-47 1303-1307 must _ _ +8-48 1308-1310 be _ _ +8-49 1311-1319 strictly _ _ +8-50 1320-1331 responsible _ _ +8-51 1332-1334 to _ _ +8-52 1335-1338 the _ _ +8-53 1339-1343 full _ _ +8-54 1344-1350 extent _ _ +8-55 1351-1353 of _ _ +8-56 1354-1357 the _ _ +8-57 1358-1362 rent _ _ +8-58 1362-1363 . _ _ + +#Text=THOMAS P HIP PARD, Clerk to the Trustees. 
+9-1 1364-1370 THOMAS _ _ +9-2 1371-1372 P _ _ +9-3 1373-1376 HIP _ _ +9-4 1377-1381 PARD _ _ +9-5 1381-1382 , _ _ +9-6 1383-1388 Clerk _ _ +9-7 1389-1391 to _ _ +9-8 1392-1395 the _ _ +9-9 1396-1404 Trustees _ _ +9-10 1404-1405 . _ _ + +#Text=Dated Wareham, 15th March, 1880. +10-1 1406-1411 Dated _ _ +10-2 1412-1419 Wareham https://en.wikipedia.org/wiki/Wareham,\_Dorset LOC +10-3 1419-1420 , _ _ +10-4 1421-1425 15th _ _ +10-5 1426-1431 March _ _ +10-6 1431-1432 , _ _ +10-7 1433-1437 1880 _ _ +10-8 1437-1438 . _ _ + +#Text=Potatoes ! +11-1 1440-1448 Potatoes _ _ +11-2 1449-1450 ! _ _ + +#Text=Pota•oee ! +12-1 1451-1455 Pota _ _ +12-2 1455-1456 • _ _ +12-3 1456-1459 oee _ _ +12-4 1460-1461 ! _ _ + +#Text=! +13-1 1462-1463 ! _ _ + +#Text=Potatoes !! +14-1 1464-1472 Potatoes _ _ +14-2 1473-1474 ! _ _ +14-3 1474-1475 ! _ _ + +#Text=! +15-1 1476-1477 ! _ _ + +#Text=JUST ARRIVED. +16-1 1478-1482 JUST _ _ +16-2 1483-1490 ARRIVED _ _ +16-3 1490-1491 . _ _ + +#Text=It.PRIME SAMPLE of Red Scotch and Regent POTATOES, Warranted excellent in quality for eating, and well adapted as a change fur Seed, are NOW SELLING at the Potatoe Stores of Mr. 
+17-1 1492-1500 It.PRIME _ _ +17-2 1501-1507 SAMPLE _ _ +17-3 1508-1510 of _ _ +17-4 1511-1514 Red _ _ +17-5 1515-1521 Scotch _ _ +17-6 1522-1525 and _ _ +17-7 1526-1532 Regent _ _ +17-8 1533-1541 POTATOES _ _ +17-9 1541-1542 , _ _ +17-10 1543-1552 Warranted _ _ +17-11 1553-1562 excellent _ _ +17-12 1563-1565 in _ _ +17-13 1566-1573 quality _ _ +17-14 1574-1577 for _ _ +17-15 1578-1584 eating _ _ +17-16 1584-1585 , _ _ +17-17 1586-1589 and _ _ +17-18 1590-1594 well _ _ +17-19 1595-1602 adapted _ _ +17-20 1603-1605 as _ _ +17-21 1606-1607 a _ _ +17-22 1608-1614 change _ _ +17-23 1615-1618 fur _ _ +17-24 1619-1623 Seed _ _ +17-25 1623-1624 , _ _ +17-26 1625-1628 are _ _ +17-27 1629-1632 NOW _ _ +17-28 1633-1640 SELLING _ _ +17-29 1641-1643 at _ _ +17-30 1644-1647 the _ _ +17-31 1648-1655 Potatoe _ _ +17-32 1656-1662 Stores _ _ +17-33 1663-1665 of _ _ +17-34 1666-1668 Mr _ _ +17-35 1668-1669 . _ _ + +#Text=JOHN BLUNDELL, Poole Arms Inn, Quay, Poole, at Bs. per cwt. or le. 4.1. per peck. +18-1 1670-1674 JOHN _ _ +18-2 1675-1683 BLUNDELL _ _ +18-3 1683-1684 , _ _ +18-4 1685-1690 Poole https://en.wikipedia.org/wiki/Poole LOC +18-5 1691-1695 Arms *[2] BUILDING[2] +18-6 1696-1699 Inn *[2] BUILDING[2] +18-7 1699-1700 , _ _ +18-8 1701-1705 Quay * STREET +18-9 1705-1706 , _ _ +18-10 1707-1712 Poole https://en.wikipedia.org/wiki/Poole LOC +18-11 1712-1713 , _ _ +18-12 1714-1716 at _ _ +18-13 1717-1719 Bs _ _ +18-14 1719-1720 . _ _ +18-15 1721-1724 per _ _ +18-16 1725-1728 cwt _ _ +18-17 1728-1729 . _ _ +18-18 1730-1732 or _ _ +18-19 1733-1735 le _ _ +18-20 1735-1736 . _ _ +18-21 1737-1740 4.1 _ _ +18-22 1740-1741 . _ _ +18-23 1742-1745 per _ _ +18-24 1746-1750 peck _ _ +18-25 1750-1751 . _ _ + +#Text=May Bth. 4e60. +19-1 1752-1755 May _ _ +19-2 1756-1759 Bth _ _ +19-3 1759-1760 . _ _ +19-4 1761-1765 4e60 _ _ +19-5 1765-1766 . 
_ _ diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/test/metadata.tsv b/tests/sample_files/resources/news_datasets/topRes19th_v2/test/metadata.tsv new file mode 100644 index 00000000..c7e7f325 --- /dev/null +++ b/tests/sample_files/resources/news_datasets/topRes19th_v2/test/metadata.tsv @@ -0,0 +1,2 @@ +fname word_count ocr_quality_mean ocr_quality_sd issue_date publication_code publication_title decade place_publication annotation_batch +9144_Poole1860 319 0.9113 0.1226 1860-05-10 2325 The Poole and South-Western Herald, etc. 1860 Poole 3 diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/train/.DS_Store b/tests/sample_files/resources/news_datasets/topRes19th_v2/train/.DS_Store new file mode 100644 index 00000000..72992317 Binary files /dev/null and b/tests/sample_files/resources/news_datasets/topRes19th_v2/train/.DS_Store differ diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/train/annotated_tsv/1218_Poole1860.tsv b/tests/sample_files/resources/news_datasets/topRes19th_v2/train/annotated_tsv/1218_Poole1860.tsv new file mode 100644 index 00000000..c18c088a --- /dev/null +++ b/tests/sample_files/resources/news_datasets/topRes19th_v2/train/annotated_tsv/1218_Poole1860.tsv @@ -0,0 +1,488 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=webanno.custom.Customentity|identifiier|value + + +#Text=THE POOLE AND SOUTH-WESTERN HERALD, THURSDAY, OCTOBER 20, 1864. +1-1 0-3 THE _ _ +1-2 4-9 POOLE _ _ +1-3 10-13 AND _ _ +1-4 14-27 SOUTH-WESTERN _ _ +1-5 28-34 HERALD _ _ +1-6 34-35 , _ _ +1-7 36-44 THURSDAY _ _ +1-8 44-45 , _ _ +1-9 46-53 OCTOBER _ _ +1-10 54-56 20 _ _ +1-11 56-57 , _ _ +1-12 58-62 1864 _ _ +1-13 62-63 . _ _ + +#Text=POOLE TOWN COUNCIL. +2-1 65-70 POOLE https://en.wikipedia.org/wiki/Poole LOC +2-2 71-75 TOWN _ _ +2-3 76-83 COUNCIL _ _ +2-4 83-84 . _ _ + +#Text=On Friday last, a meeting of the Town Cowmen was held at the Guildhall. the Mayor (G. 
+3-1 86-88 On _ _ +3-2 89-95 Friday _ _ +3-3 96-100 last _ _ +3-4 100-101 , _ _ +3-5 102-103 a _ _ +3-6 104-111 meeting _ _ +3-7 112-114 of _ _ +3-8 115-118 the _ _ +3-9 119-123 Town _ _ +3-10 124-130 Cowmen _ _ +3-11 131-134 was _ _ +3-12 135-139 held _ _ +3-13 140-142 at _ _ +3-14 143-146 the _ _ +3-15 147-156 Guildhall * BUILDING +3-16 156-157 . _ _ +3-17 158-161 the _ _ +3-18 162-167 Mayor _ _ +3-19 168-169 ( _ _ +3-20 169-170 G _ _ +3-21 170-171 . _ _ + +#Text=Belben, jun. +4-1 172-178 Belben _ _ +4-2 178-179 , _ _ +4-3 180-183 jun _ _ +4-4 183-184 . _ _ + +#Text=Esq.,) in the chair. +5-1 185-188 Esq _ _ +5-2 188-189 . _ _ +5-3 189-190 , _ _ +5-4 190-191 ) _ _ +5-5 192-194 in _ _ +5-6 195-198 the _ _ +5-7 199-204 chair _ _ +5-8 204-205 . _ _ + +#Text=There were also present: Ald J. +6-1 206-211 There _ _ +6-2 212-216 were _ _ +6-3 217-221 also _ _ +6-4 222-229 present _ _ +6-5 229-230 : _ _ +6-6 231-234 Ald _ _ +6-7 235-236 J _ _ +6-8 236-237 . _ _ + +#Text=Adey, J. +7-1 238-242 Adey _ _ +7-2 242-243 , _ _ +7-3 244-245 J _ _ +7-4 245-246 . _ _ + +#Text=Gomm, and W. +8-1 247-251 Gomm _ _ +8-2 251-252 , _ _ +8-3 253-256 and _ _ +8-4 257-258 W _ _ +8-5 258-259 . _ _ + +#Text=Pearce, and Councillel Frampton, J. +9-1 260-266 Pearce _ _ +9-2 266-267 , _ _ +9-3 268-271 and _ _ +9-4 272-282 Councillel _ _ +9-5 283-291 Frampton _ _ +9-6 291-292 , _ _ +9-7 293-294 J _ _ +9-8 294-295 . _ _ + +#Text=Harker, H. +10-1 296-302 Harker _ _ +10-2 302-303 , _ _ +10-3 304-305 H _ _ +10-4 305-306 . _ _ + +#Text=Harris, F. +11-1 307-313 Harris _ _ +11-2 313-314 , _ _ +11-3 315-316 F _ _ +11-4 316-317 . _ _ + +#Text=Btyring. and Wood. +12-1 318-325 Btyring _ _ +12-2 325-326 . _ _ +12-3 327-330 and _ _ +12-4 331-335 Wood _ _ +12-5 335-336 . _ _ + +#Text=The TOWN CLERK having read the minutes of the last meeting,, orders were made upon the borough treasurer to the amount of £SB Bs. 
+13-1 337-340 The _ _ +13-2 341-345 TOWN _ _ +13-3 346-351 CLERK _ _ +13-4 352-358 having _ _ +13-5 359-363 read _ _ +13-6 364-367 the _ _ +13-7 368-375 minutes _ _ +13-8 376-378 of _ _ +13-9 379-382 the _ _ +13-10 383-387 last _ _ +13-11 388-395 meeting _ _ +13-12 395-396 , _ _ +13-13 396-397 , _ _ +13-14 398-404 orders _ _ +13-15 405-409 were _ _ +13-16 410-414 made _ _ +13-17 415-419 upon _ _ +13-18 420-423 the _ _ +13-19 424-431 borough _ _ +13-20 432-441 treasurer _ _ +13-21 442-444 to _ _ +13-22 445-448 the _ _ +13-23 449-455 amount _ _ +13-24 456-458 of _ _ +13-25 459-460 £ _ _ +13-26 460-462 SB _ _ +13-27 463-465 Bs _ _ +13-28 465-466 . _ _ + +#Text=Id. +14-1 467-469 Id _ _ +14-2 469-470 . _ _ + +#Text=Dimmers AND FAIRS. +15-1 472-479 Dimmers _ _ +15-2 480-483 AND _ _ +15-3 484-489 FAIRS _ _ +15-4 489-490 . _ _ + +#Text=The next leceincas was to receive the tenders for taking the markets and fairs from the 25th of December next, and to declare the accepted tender. +16-1 491-494 The _ _ +16-2 495-499 next _ _ +16-3 500-509 leceincas _ _ +16-4 510-513 was _ _ +16-5 514-516 to _ _ +16-6 517-524 receive _ _ +16-7 525-528 the _ _ +16-8 529-536 tenders _ _ +16-9 537-540 for _ _ +16-10 541-547 taking _ _ +16-11 548-551 the _ _ +16-12 552-559 markets _ _ +16-13 560-563 and _ _ +16-14 564-569 fairs _ _ +16-15 570-574 from _ _ +16-16 575-578 the _ _ +16-17 579-583 25th _ _ +16-18 584-586 of _ _ +16-19 587-595 December _ _ +16-20 596-600 next _ _ +16-21 600-601 , _ _ +16-22 602-605 and _ _ +16-23 606-608 to _ _ +16-24 609-616 declare _ _ +16-25 617-620 the _ _ +16-26 621-629 accepted _ _ +16-27 630-636 tender _ _ +16-28 636-637 . _ _ + +#Text=Mr. +17-1 638-640 Mr _ _ +17-2 640-641 . _ _ + +#Text=William Gould, the late knee, offered £7O for the next year ; Mr. 
+18-1 642-649 William _ _ +18-2 650-655 Gould _ _ +18-3 655-656 , _ _ +18-4 657-660 the _ _ +18-5 661-665 late _ _ +18-6 666-670 knee _ _ +18-7 670-671 , _ _ +18-8 672-679 offered _ _ +18-9 680-683 £7O _ _ +18-10 684-687 for _ _ +18-11 688-691 the _ _ +18-12 692-696 next _ _ +18-13 697-701 year _ _ +18-14 702-703 ; _ _ +18-15 704-706 Mr _ _ +18-16 706-707 . _ _ + +#Text=Cary, of Ramsey, Hants, sent in a tender offering £lO more than Mr. +19-1 708-712 Cary _ _ +19-2 712-713 , _ _ +19-3 714-716 of _ _ +19-4 717-723 Ramsey https://en.wikipedia.org/wiki/Romsey LOC +19-5 723-724 , _ _ +19-6 725-730 Hants https://en.wikipedia.org/wiki/Hampshire LOC +19-7 730-731 , _ _ +19-8 732-736 sent _ _ +19-9 737-739 in _ _ +19-10 740-741 a _ _ +19-11 742-748 tender _ _ +19-12 749-757 offering _ _ +19-13 758-759 £ _ _ +19-14 759-761 lO _ _ +19-15 762-766 more _ _ +19-16 767-771 than _ _ +19-17 772-774 Mr _ _ +19-18 774-775 . _ _ + +#Text=Gould, a proposal which elicited some laughter and Mr. +20-1 776-781 Gould _ _ +20-2 781-782 , _ _ +20-3 783-784 a _ _ +20-4 785-793 proposal _ _ +20-5 794-799 which _ _ +20-6 800-808 elicited _ _ +20-7 809-813 some _ _ +20-8 814-822 laughter _ _ +20-9 823-826 and _ _ +20-10 827-829 Mr _ _ +20-11 829-830 . _ _ + +#Text=Gooden, of Poole, offered £95. +21-1 831-837 Gooden _ _ +21-2 837-838 , _ _ +21-3 839-841 of _ _ +21-4 842-847 Poole https://en.wikipedia.org/wiki/Poole LOC +21-5 847-848 , _ _ +21-6 849-856 offered _ _ +21-7 857-860 £95 _ _ +21-8 860-861 . _ _ + +#Text=After a little conversation Mr. +22-1 862-867 After _ _ +22-2 868-869 a _ _ +22-3 870-876 little _ _ +22-4 877-889 conversation _ _ +22-5 890-892 Mr _ _ +22-6 892-893 . _ _ + +#Text=Wood proposed. and Mr. +23-1 894-898 Wood _ _ +23-2 899-907 proposed _ _ +23-3 907-908 . _ _ +23-4 909-912 and _ _ +23-5 913-915 Mr _ _ +23-6 915-916 . _ _ + +#Text=Styring seconded the motion that Mr. 
+24-1 917-924 Styring _ _ +24-2 925-933 seconded _ _ +24-3 934-937 the _ _ +24-4 938-944 motion _ _ +24-5 945-949 that _ _ +24-6 950-952 Mr _ _ +24-7 952-953 . _ _ + +#Text=Goodeds tender be received for one year. +25-1 954-961 Goodeds _ _ +25-2 962-968 tender _ _ +25-3 969-971 be _ _ +25-4 972-980 received _ _ +25-5 981-984 for _ _ +25-6 985-988 one _ _ +25-7 989-993 year _ _ +25-8 993-994 . _ _ + +#Text=UNOCCUPIED LAND TO LET. +26-1 995-1005 UNOCCUPIED _ _ +26-2 1006-1010 LAND _ _ +26-3 1011-1013 TO _ _ +26-4 1014-1017 LET _ _ +26-5 1017-1018 . _ _ + +#Text=The TOWN CLERK read • letter from Mr. +27-1 1019-1022 The _ _ +27-2 1023-1027 TOWN _ _ +27-3 1028-1033 CLERK _ _ +27-4 1034-1038 read _ _ +27-5 1039-1040 • _ _ +27-6 1041-1047 letter _ _ +27-7 1048-1052 from _ _ +27-8 1053-1055 Mr _ _ +27-9 1055-1056 . _ _ + +#Text=James Davis, offering two guineas per annum for the right of sporting over certain land at Kinson. +28-1 1057-1062 James _ _ +28-2 1063-1068 Davis _ _ +28-3 1068-1069 , _ _ +28-4 1070-1078 offering _ _ +28-5 1079-1082 two _ _ +28-6 1083-1090 guineas _ _ +28-7 1091-1094 per _ _ +28-8 1095-1100 annum _ _ +28-9 1101-1104 for _ _ +28-10 1105-1108 the _ _ +28-11 1109-1114 right _ _ +28-12 1115-1117 of _ _ +28-13 1118-1126 sporting _ _ +28-14 1127-1131 over _ _ +28-15 1132-1139 certain _ _ +28-16 1140-1144 land _ _ +28-17 1145-1147 at _ _ +28-18 1148-1154 Kinson https://en.wikipedia.org/wiki/Kinson LOC +28-19 1154-1155 . _ _ + +#Text=Some conversation ensued relative to the land in question, Mr. +29-1 1156-1160 Some _ _ +29-2 1161-1173 conversation _ _ +29-3 1174-1180 ensued _ _ +29-4 1181-1189 relative _ _ +29-5 1190-1192 to _ _ +29-6 1193-1196 the _ _ +29-7 1197-1201 land _ _ +29-8 1202-1204 in _ _ +29-9 1205-1213 question _ _ +29-10 1213-1214 , _ _ +29-11 1215-1217 Mr _ _ +29-12 1217-1218 . _ _ + +#Text=STYRING remarking that if he were allowed a little time to consider the matter, he should probably make an offer to take the land for a term of years. 
+30-1 1219-1226 STYRING _ _ +30-2 1227-1236 remarking _ _ +30-3 1237-1241 that _ _ +30-4 1242-1244 if _ _ +30-5 1245-1247 he _ _ +30-6 1248-1252 were _ _ +30-7 1253-1260 allowed _ _ +30-8 1261-1262 a _ _ +30-9 1263-1269 little _ _ +30-10 1270-1274 time _ _ +30-11 1275-1277 to _ _ +30-12 1278-1286 consider _ _ +30-13 1287-1290 the _ _ +30-14 1291-1297 matter _ _ +30-15 1297-1298 , _ _ +30-16 1299-1301 he _ _ +30-17 1302-1308 should _ _ +30-18 1309-1317 probably _ _ +30-19 1318-1322 make _ _ +30-20 1323-1325 an _ _ +30-21 1326-1331 offer _ _ +30-22 1332-1334 to _ _ +30-23 1335-1339 take _ _ +30-24 1340-1343 the _ _ +30-25 1344-1348 land _ _ +30-26 1349-1352 for _ _ +30-27 1353-1354 a _ _ +30-28 1355-1359 term _ _ +30-29 1360-1362 of _ _ +30-30 1363-1368 years _ _ +30-31 1368-1369 . _ _ + +#Text=He was of opinion that the land might be brought into a state of cultivation. +31-1 1370-1372 He _ _ +31-2 1373-1376 was _ _ +31-3 1377-1379 of _ _ +31-4 1380-1387 opinion _ _ +31-5 1388-1392 that _ _ +31-6 1393-1396 the _ _ +31-7 1397-1401 land _ _ +31-8 1402-1407 might _ _ +31-9 1408-1410 be _ _ +31-10 1411-1418 brought _ _ +31-11 1419-1423 into _ _ +31-12 1424-1425 a _ _ +31-13 1426-1431 state _ _ +31-14 1432-1434 of _ _ +31-15 1435-1446 cultivation _ _ +31-16 1446-1447 . _ _ + +#Text=The TOWN CLERK thought it would be very unwise to let the laud for a long term, as its value might before long be greatly enhanced. 
+32-1 1448-1451 The _ _ +32-2 1452-1456 TOWN _ _ +32-3 1457-1462 CLERK _ _ +32-4 1463-1470 thought _ _ +32-5 1471-1473 it _ _ +32-6 1474-1479 would _ _ +32-7 1480-1482 be _ _ +32-8 1483-1487 very _ _ +32-9 1488-1494 unwise _ _ +32-10 1495-1497 to _ _ +32-11 1498-1501 let _ _ +32-12 1502-1505 the _ _ +32-13 1506-1510 laud _ _ +32-14 1511-1514 for _ _ +32-15 1515-1516 a _ _ +32-16 1517-1521 long _ _ +32-17 1522-1526 term _ _ +32-18 1526-1527 , _ _ +32-19 1528-1530 as _ _ +32-20 1531-1534 its _ _ +32-21 1535-1540 value _ _ +32-22 1541-1546 might _ _ +32-23 1547-1553 before _ _ +32-24 1554-1558 long _ _ +32-25 1559-1561 be _ _ +32-26 1562-1569 greatly _ _ +32-27 1570-1578 enhanced _ _ +32-28 1578-1579 . _ _ + +#Text=He alluded particularly to the promised prosperity of Weabonrne, in which be had great faith. +33-1 1580-1582 He _ _ +33-2 1583-1590 alluded _ _ +33-3 1591-1603 particularly _ _ +33-4 1604-1606 to _ _ +33-5 1607-1610 the _ _ +33-6 1611-1619 promised _ _ +33-7 1620-1630 prosperity _ _ +33-8 1631-1633 of _ _ +33-9 1634-1643 Weabonrne https://en.wikipedia.org/wiki/Westbourne,\_Dorset LOC +33-10 1643-1644 , _ _ +33-11 1645-1647 in _ _ +33-12 1648-1653 which _ _ +33-13 1654-1656 be _ _ +33-14 1657-1660 had _ _ +33-15 1661-1666 great _ _ +33-16 1667-1672 faith _ _ +33-17 1672-1673 . _ _ + +#Text=The MAYOR advised that the land should be advertised to be let This suggestion was adopted, and the Town Clerk was directed to advertise the land in question to be let by tender or otherwise. 
+34-1 1674-1677 The _ _ +34-2 1678-1683 MAYOR _ _ +34-3 1684-1691 advised _ _ +34-4 1692-1696 that _ _ +34-5 1697-1700 the _ _ +34-6 1701-1705 land _ _ +34-7 1706-1712 should _ _ +34-8 1713-1715 be _ _ +34-9 1716-1726 advertised _ _ +34-10 1727-1729 to _ _ +34-11 1730-1732 be _ _ +34-12 1733-1736 let _ _ +34-13 1737-1741 This _ _ +34-14 1742-1752 suggestion _ _ +34-15 1753-1756 was _ _ +34-16 1757-1764 adopted _ _ +34-17 1764-1765 , _ _ +34-18 1766-1769 and _ _ +34-19 1770-1773 the _ _ +34-20 1774-1778 Town _ _ +34-21 1779-1784 Clerk _ _ +34-22 1785-1788 was _ _ +34-23 1789-1797 directed _ _ +34-24 1798-1800 to _ _ +34-25 1801-1810 advertise _ _ +34-26 1811-1814 the _ _ +34-27 1815-1819 land _ _ +34-28 1820-1822 in _ _ +34-29 1823-1831 question _ _ +34-30 1832-1834 to _ _ +34-31 1835-1837 be _ _ +34-32 1838-1841 let _ _ +34-33 1842-1844 by _ _ +34-34 1845-1851 tender _ _ +34-35 1852-1854 or _ _ +34-36 1855-1864 otherwise _ _ +34-37 1864-1865 . _ _ + +#Text=There was no other business before the Council. +35-1 1866-1871 There _ _ +35-2 1872-1875 was _ _ +35-3 1876-1878 no _ _ +35-4 1879-1884 other _ _ +35-5 1885-1893 business _ _ +35-6 1894-1900 before _ _ +35-7 1901-1904 the _ _ +35-8 1905-1912 Council _ _ +35-9 1912-1913 . _ _ diff --git a/tests/sample_files/resources/news_datasets/topRes19th_v2/train/metadata.tsv b/tests/sample_files/resources/news_datasets/topRes19th_v2/train/metadata.tsv new file mode 100644 index 00000000..1250256a --- /dev/null +++ b/tests/sample_files/resources/news_datasets/topRes19th_v2/train/metadata.tsv @@ -0,0 +1,2 @@ +fname word_count ocr_quality_mean ocr_quality_sd issue_date publication_code publication_title decade place_publication annotation_batch +1218_Poole1860 347 0.8804 0.1506 1864-10-20 2325 The Poole and South-Western Herald, etc. 
1860 Poole 2 diff --git a/tests/sample_files/resources/publication_metadata.json b/tests/sample_files/resources/publication_metadata.json new file mode 100644 index 00000000..a9daace2 --- /dev/null +++ b/tests/sample_files/resources/publication_metadata.json @@ -0,0 +1,163 @@ +{ + "sn83030483": { + "publication_title": "Gazette of the United-States", + "publication_place": "New York", + "publication_ctxt": "New York", + "publication_dates": "1789-1793", + "wikidata_qid": "Q60" + }, + "sn84026272": { + "publication_title": "Gazette of the United-States", + "publication_place": "Philadelphia", + "publication_ctxt": "Pennsylvania", + "publication_dates": "1800-1801", + "wikidata_qid": "Q1345" + }, + "sn82014385": { + "publication_title": "The Delaware gazette", + "publication_place": "Wilmington", + "publication_ctxt": "Delaware", + "publication_dates": "1809-1810", + "wikidata_qid": "Q174224" + }, + "sn83026170": { + "publication_title": "Alexandria Gazette", + "publication_place": "Alexandria", + "publication_ctxt": "Virginia", + "publication_dates": "1817-1822", + "wikidata_qid": "Q88" + }, + "sn83020874": { + "publication_title": "Cherokee Phoenix, and Indian's advocate", + "publication_place": "Echota", + "publication_ctxt": "Georgia", + "publication_dates": "1829-1834", + "wikidata_qid": "Q7007061" + }, + "sn84020750": { + "publication_title": "The North Carolinian", + "publication_place": "Fayetteville", + "publication_ctxt": "North Carolina", + "publication_dates": "1839-1861", + "wikidata_qid": "Q331104" + }, + "sn85042404": { + "publication_title": "Jamestown Alert", + "publication_place": "Jamestown", + "publication_ctxt": "North Dakota", + "publication_dates": "1878-1882", + "wikidata_qid": "Q1052658" + }, + "sn88068010": { + "publication_title": "Chariton Courier", + "publication_place": "Keytesville", + "publication_ctxt": "Missouri", + "publication_dates": "1878-current", + "wikidata_qid": "Q957297" + }, + "sn86063397": { + "publication_title": "The Elk 
Mountain pilot", + "publication_place": "Irwin", + "publication_ctxt": "Colorado", + "publication_dates": "1880-19??", + "wikidata_qid": "Q592729" + }, + "sn88085488": { + "publication_title": "Pullman Herald", + "publication_place": "Pullman", + "publication_ctxt": "Washington", + "publication_dates": "1888-1989", + "wikidata_qid": "Q983540" + }, + "sn89058133": { + "publication_title": "Putnam County Herald", + "publication_place": "Cookeville", + "publication_ctxt": "Tennessee", + "publication_dates": "1903-1922", + "wikidata_qid": "Q2456192" + }, + "sn83025812": { + "publication_title": "The Independent", + "publication_place": "Elizabeth City", + "publication_ctxt": "North Carolina", + "publication_dates": "1908-1936", + "wikidata_qid": "Q1018467" + }, + "sn92063852": { + "publication_title": "The Detroit Tribune", + "publication_place": "Detroit", + "publication_ctxt": "Michigan", + "publication_dates": "1935-1966", + "wikidata_qid": "Q12439" + }, + "sn91068761": { + "publication_title": "Tabor City Tribune", + "publication_place": "Tabor City", + "publication_ctxt": "North Carolina", + "publication_dates": "1946-1991", + "wikidata_qid": "Q586130" + }, + "0000408": { + "publication_title": "Dorset County Chronicle", + "publication_place": "Dorchester", + "publication_ctxt": "Dorset", + "publication_dates": "1824-1884", + "wikidata_qid": "Q503331" + }, + "0000206": { + "publication_title": "Manchester Courier and Lancashire General Advertiser.", + "publication_place": "Manchester", + "publication_ctxt": "Lancashire", + "publication_dates": "1825-1916", + "wikidata_qid": "Q18125" + }, + "0000968": { + "publication_title": "The Ashton Weekly Reporter, and Stalybridge and Dukinfield Chronicle", + "publication_place": "Ashton-under-Lyne", + "publication_ctxt": "Lancashire", + "publication_dates": "1855-", + "wikidata_qid": "Q659803" + }, + "0000200": { + "publication_title": "The Manchester Mercury", + "publication_place": "Manchester", + "publication_ctxt": 
"Lancashire", + "publication_dates": "1752-1830", + "wikidata_qid": "Q18125" + }, + "0000201": { + "publication_title": "The Manchester Mercury", + "publication_place": "Manchester", + "publication_ctxt": "Lancashire", + "publication_dates": "1752-1830", + "wikidata_qid": "Q18125" + }, + "0000239": { + "publication_title": "The Manchester Mercury", + "publication_place": "Manchester", + "publication_ctxt": "Lancashire", + "publication_dates": "1752-1830", + "wikidata_qid": "Q18125" + }, + "0000240": { + "publication_title": "The Manchester Mercury", + "publication_place": "Manchester", + "publication_ctxt": "Lancashire", + "publication_dates": "1752-1830", + "wikidata_qid": "Q18125" + }, + "0000967": { + "publication_title": "Ashton and Stalybridge Reporter", + "publication_place": "Ashton-under-Lyne", + "publication_ctxt": "Lancashire", + "publication_dates": "1855-", + "wikidata_qid": "Q659803" + }, + "0002325": { + "publication_title": "The Poole and South-Western Herald", + "publication_place": "Poole", + "publication_ctxt": "Dorset", + "publication_dates": "1852-1889", + "wikidata_qid": "Q203349" + } +} diff --git a/tests/sample_files/resources/wikidata/entity2class.txt b/tests/sample_files/resources/wikidata/entity2class.txt new file mode 100644 index 00000000..ba7ec9bb --- /dev/null +++ b/tests/sample_files/resources/wikidata/entity2class.txt @@ -0,0 +1 @@ +{"Q8577": "Q159821", "Q79348": "Q1093829", "Q83609": "Q3957", "Q170027": "Q3918", "Q482468": "Q15127012", "Q503516": "Q13410447", "Q547824": "Q852190", "Q729177": "Q570116", "Q801124": "Q55488", "Q871138": "Q3146899", "Q985210": "Q55488", "Q1545354": "Q44782", "Q1950928": "Q2154459", "Q2018322": "Q79007", "Q2460124": "Q532", "Q5645763": "Q494829", "Q7492719": "Q3252927", "Q7746609": "Q811979", "Q14875251": "Q4204495", "Q22059065": "Q17343829", "Q108940076": "Q498162", "Q8712": "Q94993988", "Q20075": "Q5503", "Q23311": "Q515", "Q42182": "Q570116", "Q122744": "Q2755753", "Q130206": "Q537127", "Q219867": 
"Q55488", "Q503424": "Q5341295", "Q734547": "Q7631958", "Q741640": "Q202570", "Q772421": "Q494230", "Q795678": "Q55488", "Q927198": "Q2202509", "Q1137312": "Q19953632", "Q1359589": "Q11635", "Q1394500": "Q7631958", "Q1399178": "Q32815", "Q1415441": "Q1248784", "Q1988417": "Q123705", "Q2354215": "Q82794", "Q3577611": "Q105731", "Q6669870": "Q11483816", "Q7443327": "Q2380335", "Q7492567": "Q56885635", "Q7492775": "Q9035798", "Q7492778": "Q55488", "Q15242653": "Q33506", "Q27985411": "Q123705", "Q8699": "Q94993988", "Q26888": "Q7897276", "Q209266": "Q15127012", "Q220198": "Q45400320", "Q518864": "Q751708", "Q774015": "Q1002812", "Q835031": "Q3917681", "Q897533": "Q1154710", "Q1431914": "Q644371", "Q1466941": "Q55488", "Q1749569": "Q2940297", "Q2306176": "Q15127012", "Q2365261": "Q1093829", "Q3061911": "Q1093829", "Q7492607": "Q738570", "Q7721041": "Q123705", "Q17509255": "Q79007", "Q17643392": "Q811979", "Q8703": "Q94993988", "Q42448": "Q515", "Q79869": "Q1093829", "Q123738": "Q22698", "Q193196": "Q4671277", "Q194209": "Q26132862", "Q649419": "Q55488", "Q733210": "Q26132862", "Q1184547": "Q498162", "Q1449564": "Q55488", "Q1666958": "Q59861107", "Q2277715": "Q3957", "Q2477346": "Q486972", "Q3228965": "Q383092", "Q4763489": "Q18917976", "Q4834838": "Q14350", "Q5038252": "Q486972", "Q5338273": "Q476028", "Q6515927": "Q618123", "Q6670323": "Q19953632", "Q24896243": "Q483110", "Q8709": "Q94993988", "Q10818": "Q217327", "Q23306": "Q180673", "Q123885": "Q45400320", "Q279459": "Q494829", "Q489255": "Q1093829", "Q951830": "Q464780", "Q1001456": "Q1093829", "Q1323689": "Q220505", "Q1488404": "Q811979", "Q1984238": "Q751708", "Q6515866": "Q55488", "Q6669759": "Q123705", "Q6671078": "Q938381", "Q6900329": "Q2380335", "Q7492566": "Q3257686", "Q84": "Q515", "Q8111": "Q159821", "Q55018": "Q24354", "Q124234": "Q2755753", "Q160302": "Q45400320", "Q195436": "Q207694", "Q238587": "Q207694", "Q278054": "Q3146899", "Q720102": "Q55488", "Q756819": "Q79007", "Q795691": "Q55485", "Q800751": 
"Q55488", "Q800753": "Q55488", "Q1137962": "Q868557", "Q4523493": "Q702492", "Q5011830": "Q14350", "Q5177618": "Q1137272", "Q6515805": "Q17343829", "Q6515934": "Q494829", "Q7492686": "Q494829", "Q14710970": "Q17343829", "Q60578265": "Q27990982", "Q79568": "Q1093829", "Q148349": "Q2755753", "Q205679": "Q7897276", "Q212883": "Q26132862", "Q214788": "Q55488", "Q216185": "Q4989906", "Q565521": "Q1802963", "Q1128631": "Q476028", "Q1187032": "Q1076486", "Q1402606": "Q842402", "Q2716505": "Q2755753", "Q3028626": "Q18917976", "Q3461415": "Q17343829", "Q4642035": "Q41176", "Q4801470": "Q2418495", "Q4834918": "Q14350", "Q4871546": "Q178561", "Q7594521": "Q1088552", "Q15179170": "Q5367899", "Q8691": "Q94993988", "Q8982": "Q1248784", "Q23298": "Q180673", "Q39121": "Q515", "Q92561": "Q515", "Q171240": "Q11691", "Q186309": "Q667018", "Q578794": "Q18608583", "Q743535": "Q1115575", "Q746876": "Q23413", "Q801125": "Q55488", "Q823917": "Q5341295", "Q1862179": "Q55488", "Q2422792": "Q1907114", "Q3365926": "Q486972", "Q4834926": "Q14350", "Q6669738": "Q17343829", "Q7242790": "Q132241", "Q7492565": "Q532", "Q7492568": "Q17343829", "Q7492570": "Q17343829", "Q12956644": "Q1002812", "Q14946379": "Q18917976", "Q20657974": "Q17343829", "Q21061609": "Q483110"} \ No newline at end of file diff --git a/tests/sample_files/resources/wikidata/mentions_to_wikidata.json b/tests/sample_files/resources/wikidata/mentions_to_wikidata.json new file mode 100644 index 00000000..c4fa7b81 --- /dev/null +++ b/tests/sample_files/resources/wikidata/mentions_to_wikidata.json @@ -0,0 +1,183 @@ +{ + "Edinburgh University": { + "Q160302": 202, + "Q5338273": 3 + }, + "London": { + "Q170027": 49, + "Q84": 15091, + "Q800751": 16, + "Q214788": 9, + "Q1488404": 2, + "Q14946379": 20, + "Q23311": 58, + "Q6900329": 6, + "Q92561": 288, + "Q171240": 13, + "Q279459": 5, + "Q795691": 2, + "Q734547": 1, + "Q2477346": 11, + "Q3061911": 27, + "Q8577": 6, + "Q1137312": 12, + "Q6670323": 2, + "Q8691": 23, + "Q1545354": 18, + 
"Q338466": 4, + "Q1988417": 4, + "Q578794": 6, + "Q1415441": 1, + "Q8111": 2, + "Q6669759": 6, + "Q985210": 6, + "Q219867": 9, + "Q795678": 1, + "Q7242790": 2, + "Q216185": 1, + "Q2018322": 1, + "Q720102": 6, + "Q23306": 39, + "Q42182": 1, + "Q1449564": 5, + "Q733210": 1, + "Q14710970": 8, + "Q2422792": 2, + "Q1001456": 17, + "Q503516": 2, + "Q8982": 1, + "Q22059065": 3, + "Q8712": 2, + "Q20657974": 4, + "Q565521": 1, + "Q238587": 1, + "Q2716505": 1, + "Q927198": 3, + "Q122744": 1, + "Q123738": 1, + "Q8703": 2, + "Q15179170": 1, + "Q10818": 2, + "Q1359589": 8, + "Q649419": 2, + "Q15242653": 1, + "Q20075": 6, + "Q6669738": 6, + "Q756819": 1, + "Q2354215": 2, + "Q7443327": 1, + "Q123885": 3, + "Q55018": 1, + "Q130206": 2, + "Q4642035": 1, + "Q729177": 2, + "Q1399178": 1, + "Q5645763": 1, + "Q194209": 1, + "Q801124": 6, + "Q7737135": 1, + "Q4834838": 1, + "Q17509255": 1, + "Q951830": 1, + "Q800753": 1, + "Q6671078": 3, + "Q186309": 1, + "Q148349": 1, + "Q212883": 1, + "Q195436": 1, + "Q5038252": 1, + "Q743535": 1, + "Q83609": 1, + "Q79348": 11, + "Q193196": 1, + "Q4801470": 1, + "Q220198": 1, + "Q124234": 1, + "Q23298": 1, + "Q1431914": 1, + "Q835031": 1, + "Q1323689": 2, + "Q7594521": 1, + "Q26888": 1, + "Q8709": 2, + "Q1402606": 1, + "Q278054": 1, + "Q801125": 1, + "Q205679": 1, + "Q6669870": 1, + "Q1666958": 1, + "Q5011830": 1, + "Q1394500": 1, + "Q772421": 1, + "Q1749569": 1, + "Q60578265": 2 + }, + "Leeds": { + "Q39121": 1503, + "Q1466941": 140, + "Q1128631": 21, + "Q774015": 18, + "Q503424": 30, + "Q1137962": 2, + "Q6515934": 3, + "Q4834918": 2, + "Q7721041": 1, + "Q482468": 14, + "Q2460124": 10, + "Q79869": 14, + "Q746876": 4, + "Q6515805": 4, + "Q3461415": 10, + "Q2365261": 15, + "Q21061609": 12, + "Q7746609": 1, + "Q14875251": 2, + "Q6515927": 1, + "Q5177618": 1, + "Q8699": 1, + "Q27985411": 1, + "Q6515866": 1, + "Q871138": 2, + "Q4763489": 1, + "Q4871546": 1, + "Q24896243": 1, + "Q1187032": 3, + "Q489255": 2, + "Q3228965": 1, + "Q209266": 8 + }, + 
"Sheffield": { + "Q42448": 1288, + "Q7492778": 10, + "Q7492565": 6, + "Q1862179": 116, + "Q823917": 24, + "Q4834926": 3, + "Q17643392": 1, + "Q2306176": 31, + "Q897533": 5, + "Q7492570": 5, + "Q1950928": 11, + "Q2277715": 11, + "Q79568": 16, + "Q518864": 8, + "Q7492591": 3, + "Q7492775": 1, + "Q741640": 1, + "Q7492686": 1, + "Q3577611": 1, + "Q12956644": 11, + "Q547824": 1, + "Q7492719": 1, + "Q7492566": 6, + "Q7492567": 3, + "Q4523493": 1, + "Q3028626": 1, + "Q7492607": 1, + "Q3365926": 4, + "Q7492568": 4, + "Q108940076": 3, + "Q1184547": 9, + "Q1984238": 8 + } +} \ No newline at end of file diff --git a/tests/sample_files/resources/wikidata/mentions_to_wikidata_normalized.json b/tests/sample_files/resources/wikidata/mentions_to_wikidata_normalized.json new file mode 100644 index 00000000..22c19c13 --- /dev/null +++ b/tests/sample_files/resources/wikidata/mentions_to_wikidata_normalized.json @@ -0,0 +1,183 @@ +{ + "Edinburgh University": { + "Q160302": 0.12235009085402786, + "Q5338273": 0.75 + }, + "London": { + "Q170027": 0.04149026248941575, + "Q84": 0.9822311897943244, + "Q800751": 0.042105263157894736, + "Q214788": 0.02122641509433962, + "Q1488404": 0.058823529411764705, + "Q14946379": 0.18518518518518517, + "Q23311": 0.033603707995365, + "Q6900329": 0.007211538461538462, + "Q92561": 0.3769633507853403, + "Q171240": 0.01906158357771261, + "Q279459": 0.07936507936507936, + "Q795691": 0.0045045045045045045, + "Q734547": 0.0028089887640449437, + "Q2477346": 0.8461538461538463, + "Q3061911": 0.6923076923076923, + "Q8577": 0.00495458298926507, + "Q1137312": 0.058252427184466014, + "Q6670323": 0.2222222222222222, + "Q8691": 0.026713124274099883, + "Q1545354": 0.18181818181818182, + "Q338466": 0.13333333333333333, + "Q1988417": 0.0975609756097561, + "Q578794": 0.03870967741935484, + "Q1415441": 0.014285714285714285, + "Q8111": 0.007352941176470588, + "Q6669759": 1.0, + "Q985210": 0.01759530791788856, + "Q219867": 0.025423728813559324, + "Q795678": 
0.03333333333333333, + "Q7242790": 0.06896551724137931, + "Q216185": 0.002421307506053269, + "Q2018322": 0.03225806451612903, + "Q720102": 0.015789473684210527, + "Q23306": 0.04216216216216216, + "Q42182": 0.0010256410256410256, + "Q1449564": 0.7142857142857142, + "Q733210": 0.09090909090909091, + "Q14710970": 0.7272727272727273, + "Q2422792": 0.025974025974025976, + "Q1001456": 0.6071428571428571, + "Q503516": 0.07142857142857142, + "Q8982": 0.007352941176470588, + "Q22059065": 1.0, + "Q8712": 0.012121212121212121, + "Q20657974": 1.0, + "Q565521": 0.012345679012345678, + "Q238587": 0.0026595744680851063, + "Q2716505": 0.017857142857142856, + "Q927198": 0.01948051948051948, + "Q122744": 0.007462686567164179, + "Q123738": 0.0017667844522968198, + "Q8703": 0.0047169811320754715, + "Q15179170": 0.14285714285714285, + "Q10818": 0.008097165991902834, + "Q1359589": 0.004781829049611476, + "Q649419": 0.013157894736842105, + "Q15242653": 0.06666666666666667, + "Q20075": 0.004062288422477996, + "Q6669738": 1.0, + "Q756819": 0.003125, + "Q2354215": 0.004140786749482402, + "Q7443327": 0.0625, + "Q123885": 0.001547987616099071, + "Q55018": 0.0012531328320802004, + "Q130206": 0.007272727272727273, + "Q4642035": 0.16666666666666666, + "Q729177": 0.11764705882352941, + "Q1399178": 0.125, + "Q5645763": 0.14285714285714285, + "Q194209": 0.06666666666666667, + "Q801124": 0.015463917525773196, + "Q7737135": 0.16666666666666666, + "Q4834838": 0.006211180124223602, + "Q17509255": 0.09090909090909091, + "Q951830": 0.0041841004184100415, + "Q800753": 0.01020408163265306, + "Q6671078": 0.12, + "Q186309": 0.005813953488372093, + "Q148349": 0.0038314176245210726, + "Q212883": 0.3333333333333333, + "Q195436": 0.004366812227074236, + "Q5038252": 1.0, + "Q743535": 0.0017699115044247787, + "Q83609": 0.008, + "Q79348": 1.0, + "Q193196": 0.0008116883116883117, + "Q4801470": 0.024390243902439025, + "Q220198": 0.0055248618784530384, + "Q124234": 0.009615384615384616, + "Q23298": 
0.0004050222762251924, + "Q1431914": 0.011904761904761904, + "Q835031": 0.0625, + "Q1323689": 0.016666666666666666, + "Q7594521": 0.16666666666666666, + "Q26888": 0.006756756756756757, + "Q8709": 0.007751937984496124, + "Q1402606": 0.0625, + "Q278054": 0.14285714285714285, + "Q801125": 0.005291005291005291, + "Q205679": 0.0036900369003690036, + "Q6669870": 0.05555555555555555, + "Q1666958": 0.03571428571428571, + "Q5011830": 0.25, + "Q1394500": 0.0026595744680851063, + "Q772421": 0.025, + "Q1749569": 0.16666666666666666, + "Q60578265": 0.2857142857142857 + }, + "Leeds": { + "Q39121": 0.9868680236375573, + "Q1466941": 0.8484848484848485, + "Q1128631": 0.020114942528735632, + "Q774015": 0.20930232558139533, + "Q503424": 0.06382978723404255, + "Q1137962": 0.004310344827586207, + "Q6515934": 0.375, + "Q4834918": 0.06060606060606061, + "Q7721041": 1.0, + "Q482468": 1.0, + "Q2460124": 0.6666666666666666, + "Q79869": 0.5833333333333333, + "Q746876": 0.08333333333333333, + "Q6515805": 0.5, + "Q3461415": 1.0, + "Q2365261": 1.0, + "Q21061609": 0.058536585365853655, + "Q7746609": 0.05263157894736842, + "Q14875251": 0.2, + "Q6515927": 0.1111111111111111, + "Q5177618": 0.14285714285714285, + "Q8699": 0.013888888888888888, + "Q27985411": 1.0, + "Q6515866": 0.3333333333333333, + "Q871138": 0.15384615384615385, + "Q4763489": 0.029411764705882353, + "Q4871546": 1.0, + "Q24896243": 0.25, + "Q1187032": 0.06382978723404255, + "Q489255": 0.007462686567164179, + "Q3228965": 0.022727272727272728, + "Q209266": 0.6666666666666666 + }, + "Sheffield": { + "Q42448": 0.9401459854014598, + "Q7492778": 0.3448275862068966, + "Q7492565": 1.0, + "Q1862179": 0.7341772151898734, + "Q823917": 0.0851063829787234, + "Q4834926": 0.08571428571428572, + "Q17643392": 0.05263157894736842, + "Q2306176": 0.5740740740740741, + "Q897533": 0.03731343283582089, + "Q7492570": 1.0, + "Q1950928": 0.6470588235294118, + "Q2277715": 0.7857142857142857, + "Q79568": 0.48484848484848486, + "Q518864": 0.7272727272727273, + 
"Q7492591": 0.30000000000000004, + "Q7492775": 0.2, + "Q741640": 0.25, + "Q7492686": 0.125, + "Q3577611": 0.1, + "Q12956644": 0.34375, + "Q547824": 0.047619047619047616, + "Q7492719": 1.0, + "Q7492566": 1.0, + "Q7492567": 1.0, + "Q4523493": 0.125, + "Q3028626": 0.08333333333333333, + "Q7492607": 0.030303030303030304, + "Q3365926": 0.5714285714285714, + "Q7492568": 1.0, + "Q108940076": 0.75, + "Q1184547": 0.8181818181818182, + "Q1984238": 0.6153846153846154 + } +} \ No newline at end of file diff --git a/tests/sample_files/resources/wikidata/wikidata_gazetteer.csv b/tests/sample_files/resources/wikidata/wikidata_gazetteer.csv new file mode 100644 index 00000000..62f5ad99 --- /dev/null +++ b/tests/sample_files/resources/wikidata/wikidata_gazetteer.csv @@ -0,0 +1,174 @@ +wikidata_id,english_label,instance_of,alias_dict,nativelabel,hcounties,countries,latitude,longitude +Q160302,University of Edinburgh,"['Q875538', 'Q45400320', 'Q2667285']","{'pl': ['Uniwersytet Edynburski', 'Uniwersytet w Edynburgu', 'University of Edinburgh'], 'gd': ['Oilthigh Dhùn Éideann', 'Oilthigh Dùn Eideann', 'Oilthigh Dun Eideann', 'University of Edinburgh', 'Oilthigh Dhùn Eideann', 'Oilthigh Dhùn Èideann'], 'es': ['Universidad de Edinburgo', 'Universidad de Edimburgo la Escuela de Medicina', 'Universidad de Edimburgo, la Escuela de Medicina', 'Universidad de Edinburgh', 'Escuela de Medicina de la Universidad de Edimburgo', 'University of Edinburgh', 'Universidad de Edimburgo'], 'ga': ['Ollscoil Dún Éideann', 'Ollscoil Dhún Éideann'], 'nl': ['University of Edinburgh', 'Edinburgh University', 'Universiteit van Edinburgh'], 'pt': ['University of Edinburgh', 'Oilthigh Dhùn Èideann', 'Universidade de Edinburgh', 'Universidade de Edimburgo'], 'tr': ['University of Edinburgh', 'Edinburg Üniversitesi', 'Edinburgh Üniversitesi'], 'fr': [""Université d'Edimbourg"", ""Université d'Édinbourg"", ""L'Université d'Édimbourg"", ""Université d'Edinburgh"", 'Université d’Édimbourg', ""Universite d'Edimbourg"", 
'University of Edinburgh', 'Edinburgh University', ""université d'Édimbourg""], 'it': ['University of Edinburgh', 'Università di Edimburgo'], 'de': ['Edinburgh University', 'Universität Edinburgh', 'Universität von Edinburgh'], 'ro': ['University of Edinburgh', 'Universitatea Edinburgh', 'Universitatea din Edinburgh'], 'en': ['Edinburgh University', 'The University of Edinburgh', 'University of Edinburgh'], 'uk': ['Единбурзький університет'], 'sco': ['Varsity o Edinburgh'], 'cy': ['Prifysgol Caeredin'], 'en-ca': ['University of Edinburgh'], 'en-gb': ['University of Edinburgh'], 'kw': ['Pennskol Karedin']}","['University of Edinburgh', 'University o Edinburgh', 'Oilthigh Dhùn Èideann']",['Q67317221'],"{'Q145': ('', '')}",55.947389,-3.187194 +Q5338273,Edinburgh University A.F.C.,['Q476028'],"{'en': ['Edinburgh University A.F.C.'], 'es': ['Edinburgh University A.F.C.'], 'nl': ['Edinburgh University A.F.C.'], 'fr': ['Edinburgh University Association Football Club']}",,['Q67317221'],"{'Q145': ('', '')}",55.93175,-3.149911 +Q170027,University of London,"['Q3918', 'Q38723', 'Q45400320', 'Q5341295']","{'pl': ['Uniwersytet Londyński', 'University of London'], 'es': ['University of London', 'Universidad de Londres'], 'nl': ['University of london', 'London University', 'Universiteit van Londen'], 'pt': ['University of London', 'University college london', 'Universidade de Londres'], 'tr': ['University of London', 'Londra Üniversitesi'], 'fr': ['Universite de Londres', 'University of London', 'université de Londres'], 'it': ['London University', 'University of London', 'Università di Londra'], 'de': ['Uni London', 'London University', 'Universität von London', 'University of London'], 'ro': ['University of London', 'Universitatea din Londra'], 'en': ['London University', 'Lond.', 'University of London'], 'gd': ['Oilthigh Lunnainn'], 'uk': ['Лондонський університет'], 'cy': ['Prifysgol Llundain'], 'sco': ['Varsity o Lunnon'], 'ga': ['Ollscoil Londan'], 'en-ca': ['University of 
London'], 'en-gb': ['University of London'], 'en-us': ['University of London']}",['University of London'],['Q19186'],"{'Q145': ('', '')}",51.521111,-0.128889 +Q84,London,"['Q200250', 'Q1066984', 'Q515', 'Q1637706', 'Q208511', 'Q5119', 'Q174844', 'Q51929311']","{'en-gb': ['London, UK', 'London, United Kingdom', 'London, England', 'London'], 'en': ['London, UK', 'London, United Kingdom', 'London, England', 'London UK', 'London U.K.', 'Greater London', 'Londinium', 'Loñ', 'Lundenwic', 'Londinio', 'Londini', 'Londiniensium', 'Augusta', 'Trinovantum', 'Kaerlud', 'Karelundein', 'Lunden', 'Big Smoke', 'the Big Smoke', 'Lundenburh', 'Lundenburgh', 'Llyn Dain', 'Llan Dian', 'Londinion', 'Loniniensi', 'Lon.', 'Loñ.', 'Lond.', 'London'], 'es': ['Londres (Reino Unido)', 'Londres (Inglaterra)', 'Greater London', 'London, UK', 'Londres'], 'fr': ['London', 'Londres'], 'nl': ['Londen, VK', 'Londen, Verenigd Koninkrijk', 'Londen, Engeland', 'Londen'], 'ga': ['Doirelondain', 'Londain'], 'pt': ['Londres, Reino Unido', 'Londres, Inglaterra', 'Londres, UK', 'Londres, GBR', 'Londres'], 'it': ['Londra'], 'pl': ['Londyn'], 'de': ['London'], 'en-ca': ['London'], 'sco': ['Lunnon'], 'cy': ['Llundain'], 'gd': ['Lunnainn'], 'kw': ['Loundres'], 'ro': ['Londra'], 'tr': ['Londra'], 'uk': ['Лондон']}","['London', 'Llundain', 'Lunnainn', 'Lunnon']","['Q19186', 'Q67443130', 'Q67479626', 'Q67442940', 'Q67532100']","{'Q2277': ('0047', '0410'), 'Q110888': ('0500', '0730'), 'Q105092': ('0730', '0918'), 'Q105313': ('0918', '0927'), 'Q179876': ('0927', '1707'), 'Q161885': ('1707', '1800'), 'Q174193': ('1801', '1922'), 'Q145': ('1922', '')}",51.507222,-0.1275 +Q800751,Euston Station,"['Q55488', 'Q55485']","{'pt': ['Estação de Euston (Metro de Londres)', 'Estação de Euston', 'Euston (Metropolitano de Londres)'], 'de': ['Euston station', 'Euston'], 'it': ['Stazione di Euston', 'stazione di Londra Euston'], 'en': ['London Euston', 'Euston railway station', 'Euston station', 'Euston Station'], 'es': ['Estacion 
de Euston', 'Estación de Euston'], 'ro': ['Londra Euston', 'Gara Londra Euston', 'gara Euston'], 'fr': ['Euston'], 'nl': ['Station London Euston'], 'cy': ['Gorsaf reilffordd Euston'], 'pl': ['Euston Station'], 'en-ca': ['Euston railway station'], 'en-gb': ['Euston railway station'], 'tr': ['Euston Tren İstasyonu'], 'uk': ['Юстон'], 'ga': ['Stáisiún Euston']}",,['Q19186'],"{'Q145': ('', '')}",51.5284,-0.1331 +Q214788,London Paddington station,"['Q55488', 'Q55485', 'Q1793804', 'Q20202072']","{'fr': ['gare de Paddington', 'gare de Londres Paddington', 'London Paddington'], 'es': ['Paddington', 'Estacion de Paddington', 'Estación de Paddington'], 'it': ['Stazione di Paddington', 'stazione di Londra Paddington'], 'de': ['Paddington Station', 'Paddington railway station', 'Bahnhof Paddington'], 'nl': ['Station Paddington', 'Paddington station', 'Station London Paddington'], 'pt': ['Estação de Paddington', 'London Paddington', 'Estação de London Paddington', 'Estação Paddington'], 'en': ['Paddington station', 'Paddington Railway station', ""Paddington (Bishop's Road) station"", 'Paddington', 'London Paddington', 'Paddington Bear Station', 'London Paddington station'], 'ro': ['Londra Paddington', 'gara Paddington', 'gara Londra Paddington'], 'cy': ['Gorsaf Paddington Llundain', 'Gorsaf reilffordd Paddington Llundain'], 'tr': ['Paddington Tren İstasyonu', ""Paddington (Bishop's Road) Tren İstasyonu"", 'Londra Paddington Tren İstasyonu'], 'pl': ['Paddington station'], 'uk': ['Паддінгтон'], 'en-ca': ['London Paddington station'], 'en-gb': ['London Paddington station']}",,['Q19186'],"{'Q145': ('', '')}",51.516667,-0.177222 +Q1488404,London Docks,['Q811979'],"{'de': ['London Docks'], 'en': ['London Docks'], 'fr': ['Docks de Londres']}",,['Q19186'],"{'Q145': ('', '')}",51.506,-0.060333 +Q14946379,Diocese of London,['Q18917976'],"{'it': ['diocesi di Londra', 'diocesi anglicana di Londra'], 'es': ['diócesis de Londres', 'diócesis anglicana de Londres'], 'en': ['Diocese of 
London'], 'en-ca': ['Diocese of London'], 'en-gb': ['Diocese of London'], 'de': ['Diözese London'], 'fr': ['diocèse de Londres'], 'pt': ['Diocese de Londres'], 'pl': ['Diecezja londyńska'], 'cy': ['Esgobaeth Llundain'], 'nl': ['Bisdom Londen']}",,[],"{'Q145': ('', '')}",51.5138,-0.0986 +Q23311,City of London,"['Q515', 'Q1066984', 'Q738570', 'Q180673', 'Q17601336', 'Q21503295', 'Q7897276']","{'en': ['the City', 'Square Mile', 'City and County of the City of London', 'City of London (unparished area)', 'London', 'City of London'], 'fr': ['La City', 'City of London', 'Cité de Londres', 'cité de Londres'], 'de': ['London', 'City of London'], 'it': ['La City', 'City of London', 'City di Londra', 'Città di Londra'], 'en-gb': ['the City', 'City and County of the City of London', 'City of London'], 'cy': ['y Filltir Sgwâr', 'Dinas Llundain'], 'en-ca': ['City of London'], 'es': ['City de Londres'], 'ga': ['Cathair Londan'], 'nl': ['City of London'], 'pl': ['City of London'], 'pt': ['Cidade de Londres'], 'ro': ['City of London'], 'sco': ['Ceety o Lunnon'], 'tr': ['Londra Şehri'], 'uk': ['Лондонське Сіті']}",['City of London'],['Q19186'],"{'Q145': ('', '')}",51.515556,-0.093056 +Q6900329,The Blitz,['Q2380335'],"{'en': ['Blitz', 'London Blitz', 'The Blitz'], 'cy': ['Y Blitz'], 'es': ['Blitz'], 'pt': ['Blitz'], 'tr': ['The Blitz'], 'ro': ['The Blitz'], 'pl': ['Blitz'], 'fr': ['Blitz'], 'de': ['The Blitz'], 'en-ca': ['The Blitz'], 'en-gb': ['The Blitz'], 'ga': ['An Bhleaist'], 'sco': ['The Blitz'], 'nl': ['The Blitz'], 'it': ['The Blitz'], 'uk': ['Бліц']}",,[],"{'Q145': ('', '')}",51.506944,-0.1275 +Q92561,London,"['Q6593035', 'Q14762300', 'Q515', 'Q1549591']","{'en': ['London, ON', 'London, Ontario', 'London'], 'fr': ['London, Ontario', 'London'], 'es': ['London (Ontario)', 'Londres (Ontario)', 'London'], 'en-gb': ['London'], 'de': ['London'], 'it': ['London'], 'nl': ['London'], 'pl': ['London'], 'pt': ['London'], 'uk': ['Лондон'], 'tr': ['London'], 'ga': ['Londain, Ontario'], 
'cy': ['Llundain'], 'gd': ['Lunnainn'], 'ro': ['London']}",,[],"{'Q16': ('', '')}",42.9837,-81.2497 +Q171240,London Stock Exchange,['Q11691'],"{'pl': ['Londyńska Giełda Papierów Wartościowych', 'Giełda Papierów Wartościowych w Londynie', 'London Stock Exchange'], 'fr': ['London Stock Exchange', 'bourse de Londres'], 'es': ['London Stock Exchange', 'Bolsa de Valores de Londres', 'Bolsa de Londres'], 'it': ['Borsa di Londra', 'London Stock Exchange'], 'de': ['Londoner Börse', 'London Stock Exchange Group', 'London Stock Exchange'], 'nl': ['Londense beurs', 'London Stock Exchange'], 'pt': ['Bolsa de Londres', 'London stock exchange', 'Bolsa de Valores de Londres'], 'tr': ['Londra Menkul Değerler Borsası', 'London Stock Exchange', 'Londra Borsası'], 'ro': ['Bursa de Valori din Londra', 'Bursa de la Londra', 'Bursa londoneză', 'Bursa din Londra', 'London Stock Exchange'], 'en': ['London Stock Exchange, LSE', 'London Stock Exchange'], 'uk': ['Лондонська фондова біржа'], 'cy': ['Cyfnewidfa Stoc Llundain'], 'sco': ['Lunnon Stock Exchynge'], 'en-ca': ['London Stock Exchange'], 'en-gb': ['London Stock Exchange']}",,['Q19186'],"{'Q145': ('', '')}",51.515065,-0.098972 +Q279459,Victoria Coach Station,['Q494829'],"{'fr': ['Gare routière Victoria', 'Victoria Coach Station'], 'de': ['Victoria Coach Station'], 'en': ['Victoria Coach Station'], 'es': ['Victoria Coach Station'], 'it': ['autostazione di Victoria'], 'pl': ['Victoria Coach Station'], 'nl': ['Victoria Coach Station'], 'en-gb': ['Victoria Coach Station'], 'pt': ['Estação rodoviária Victoria']}",,['Q19186'],"{'Q145': ('', '')}",51.49316,-0.14864 +Q795691,London Waterloo station,"['Q18543139', 'Q55485']","{'es': ['Waterloo station', 'Estacion de Waterloo', 'Estacion Waterloo', 'London Waterloo', 'Estación Waterloo', 'London Waterloo station', 'Estación de Waterloo'], 'nl': ['Station London Waterloo Station', 'Waterloo Station', 'London Waterloo', 'Station London Waterloo'], 'en': ['Waterloo station', 'Waterloo railway 
station', 'Waterloo train station', 'London Waterloo station'], 'de': ['Waterloo Bridge', 'Waterloo'], 'it': ['stazione di Waterloo', 'stazione di Londra Waterloo'], 'pt': ['Estação Waterloo'], 'uk': ['Ватерлоо'], 'en-ca': ['London Waterloo station'], 'en-gb': ['London Waterloo station'], 'pl': ['Waterloo Station'], 'cy': ['Gorsaf Waterloo Llundain'], 'fr': ['gare de Londres-Waterloo'], 'ga': ['Stáisiún Londain Waterloo'], 'tr': ['Waterloo İstasyonu']}",,['Q67443130'],"{'Q145': ('', '')}",51.5031,-0.1132 +Q734547,North London,['Q7631958'],"{'en': ['London/North', 'North London'], 'nl': ['Londen/North', 'Noord-Londen'], 'fr': ['North London'], 'it': ['North London'], 'pt': ['North London'], 'en-ca': ['North London'], 'en-gb': ['North London'], 'cy': ['Gogledd Lundain'], 'es': ['Norte de Londres'], 'ga': ['Londain Thuaidh']}",,['Q67479626'],"{'Q145': ('', '')}",51.54962,-0.167614 +Q2477346,London,['Q486972'],"{'fr': ['Londres', 'Ronton', 'London'], 'es': ['Londres', 'Ronton', 'London'], 'it': ['Ronton', 'London'], 'en': ['Ronton', 'London'], 'pt': ['Ronton', 'London'], 'pl': ['London'], 'nl': ['London'], 'de': ['London']}",['London'],[],"{'Q710': ('', '')}",1.983333,-157.475 +Q3061911,London,['Q1093829'],"{'en': ['London, Kentucky', 'London, KY', 'London'], 'tr': ['London, Kentucky', 'London'], 'es': ['Londres, Kentucky', 'London'], 'nl': ['London'], 'pl': ['London'], 'pt': ['London'], 'de': ['London'], 'fr': ['London'], 'it': ['London'], 'cy': ['London'], 'uk': ['Лондон'], 'ga': ['London']}",,[],"{'Q30': ('', '')}",37.1275,-84.0842 +Q8577,2012 Summer Olympics,['Q159821'],"{'en': ['London 2012', 'Games of the XXX Olympiad', '2012 Olympics', '2012 London Olympics', 'London Olympics', 'London Olympic Games', '2012 Summer Olympics'], 'it': ['Londra 2012', 'Giochi della XXX Olimpiade'], 'de': ['London2012', 'Olympia 2012', 'London 2012', 'Olympische Spiele 2012', 'Spiele der XXX. Olympiade', 'XXX. 
Olympische Sommerspiele', 'Olympische Sommerspiele 2012'], 'pt': ['Londres 2012', 'Jogos da XXX Olimpíada', 'Jogos Olímpicos de 2012', 'Jogos Olímpicos de Verão de 2012'], 'es': ['Londres 2012', 'Juegos de la XXX Olimpiada', 'Juegos Olímpicos de Londres 2012'], 'fr': ['Londres 2012', 'Jeux olympiques 2012', 'Jeux de la XXXe olympiade', ""Jeux olympiques d'été de 2012""], 'cy': ['Gemau Olympaidd yr Haf 2012'], 'en-ca': ['2012 Summer Olympics'], 'en-gb': ['2012 Summer Olympics'], 'ga': ['Cluichí Oilimpeacha an tSamhraidh 2012'], 'nl': ['Olympische Zomerspelen 2012'], 'pl': ['Letnie Igrzyska Olimpijskie 2012'], 'ro': ['Jocurile Olimpice de vară din 2012'], 'sco': ['2012 Simmer Olympics'], 'tr': ['2012 Yaz Olimpiyatları'], 'uk': ['Літні Олімпійські ігри 2012']}",,[],"{'Q145': ('', '')}",51.538611,-0.016389 +Q1137312,County of London,"['Q67376938', 'Q19953632', 'Q2560047', 'Q180673', 'Q21272231']","{'it': ['County of London', 'contea di Londra'], 'fr': ['comté de Londres'], 'en': ['County of London'], 'nl': ['Graafschap Londen'], 'pt': ['Condado de Londres'], 'de': ['County of London'], 'es': ['condado de Londres'], 'pl': ['County of London'], 'uk': ['Лондонське графство'], 'cy': ['Sir Llundain'], 'en-gb': ['County of London']}",,['Q19186'],"{'Q145': ('', '')}",51.5155,-0.0922 +Q6670323,"London District, Upper Canada",['Q19953632'],"{'en': ['London District, Upper Canada']}",,[],"{'Q16': ('', '')}",43.4,-81.2 +Q8691,Heathrow Airport,"['Q644371', 'Q94993988', 'Q43229']","{'en': ['Heathrow', 'London Heathrow Airport', 'London Heathrow', ""London's Heathrow Airport"", 'Heathrow Airport'], 'de': ['Heathrow', 'Flughafen London Heathrow'], 'it': ['Aeroporto di Heathrow', 'Heathrow', 'Aeroporto di Londra-Heathrow'], 'fr': ['Londres Heathrow', 'aéroport de Londres Heathrow'], 'nl': ['London Heathrow Airport', 'Londen Heathrow', 'Luchthaven Heathrow', 'Luchthaven Londen Heathrow'], 'cy': ['Maes Awyr Heathrow'], 'es': ['Aeropuerto de Londres-Heathrow'], 'ga': ['Aerfort 
Londain-Heathrow'], 'pl': ['Port lotniczy Londyn-Heathrow'], 'pt': ['Aeroporto de Londres Heathrow'], 'ro': ['Aeroportul Londra Heathrow'], 'tr': ['Heathrow Havalimanı'], 'uk': ['Хітроу']}",['London Heathrow Airport'],['Q19186'],"{'Q145': ('', '')}",51.4775,-0.461389 +Q1545354,Port of London,"['Q44782', 'Q863915', 'Q15310171']","{'de': ['Londoner Hafen'], 'en': ['Port of London'], 'es': ['Puerto de Londres'], 'fr': ['port de Londres'], 'it': ['porto di Londra'], 'nl': ['haven van Londen'], 'en-ca': ['Port of London'], 'en-gb': ['Port of London'], 'cy': ['Porthladd Llundain']}",['Port of London'],['Q67442940'],"{'Q145': ('', '')}",51.5,0.05 +Q338466,Anglo-Saxon London,,"{'en': ['Anglo-Saxon London'], 'it': ['Londra anglosassone']}",,[],"{'Q145': ('', '')}",51.5125,-0.1225 +Q1988417,Chinatown,"['Q202509', 'Q2755753', 'Q123705']","{'es': ['Chinatown de Londres', 'Barrio Chino de Londres', 'Chinatown'], 'nl': ['London Chinatown', 'Londen Chinatown', 'Chinatown'], 'en': ['London Chinatown', 'Chinatown, London', 'China Town', 'China Town, London', 'Chinatown'], 'fr': ['Chinatown'], 'uk': ['Чайна-таун'], 'it': ['Chinatown'], 'ga': ['Ceantar Síneach'], 'de': ['Chinatown'], 'en-gb': ['Chinatown']}",,['Q19186'],"{'Q145': ('', '')}",51.511111,-0.131389 +Q578794,London Marathon,"['Q40244', 'Q18608583']","{'es': ['Maraton de Londres', 'Maratón de Londres'], 'de': ['London-Marathon'], 'en': ['London Marathon'], 'fr': ['Marathon de Londres'], 'it': ['Maratona di Londra'], 'nl': ['Marathon van Londen'], 'pt': ['Maratona de Londres'], 'tr': ['Londra Maratonu'], 'cy': ['Marathon Llundain'], 'uk': ['Лондонський марафон'], 'pl': ['Maraton w Londynie'], 'en-ca': ['London Marathon'], 'en-gb': ['London Marathon']}",,[],"{'Q145': ('', '')}",51.472778,0.009444 +Q1415441,London Southend Airport,"['Q1248784', 'Q94993988']","{'de': ['Southend Airport', 'London Southend Airport'], 'en': ['Southend', 'London Southend Airport'], 'fr': ['aérodrome de Royaume Uni', 'Southend', 'aéroport de Londres 
Southend'], 'nl': ['London Southend Airport', 'Luchthaven London Southend'], 'pl': ['Port lotniczy Londyn-Southend'], 'it': ['Aeroporto di Londra-Southend'], 'es': ['Aeropuerto de Londres-Southend'], 'tr': ['Londra Southend Havalimanı'], 'uk': ['Лондон-Саутенд'], 'ro': ['Aeroportul Londra Southend']}",,['Q67442940'],"{'Q145': ('', '')}",51.570278,0.693333 +Q8111,1908 Summer Olympics,['Q159821'],"{'en': ['London 1908', 'Games of the IV Olympiad', '1908 Summer Olympics'], 'it': ['Londra 1908', 'Giochi della IV Olimpiade'], 'fr': ['Londres 1908', ""Jeux olympiques d'été de 1908""], 'es': ['Juegos de Londres 1908', 'Juegos Olímpicos de Londres 1908'], 'nl': ['Zomerspelen 1908', 'Olympische Zomerspelen 1908'], 'de': ['Olympische Sommerspiele 1908'], 'cy': ['Gemau Olympaidd yr Haf 1908'], 'en-ca': ['1908 Summer Olympics'], 'en-gb': ['1908 Summer Olympics'], 'pl': ['Letnie Igrzyska Olimpijskie 1908'], 'pt': ['Jogos Olímpicos de Verão de 1908'], 'ro': ['Jocurile Olimpice de vară din 1908'], 'tr': ['1908 Yaz Olimpiyatları'], 'uk': ['Літні Олімпійські ігри 1908'], 'ga': ['Cluichí Oilimpeacha an tSamhraidh 1908'], 'sco': ['1908 Simmer Olympics']}",,[],"{'Q174193': ('', '')}",51.51362,-0.2274 +Q6669759,London,['Q123705'],"{'en': ['London'], 'nl': ['London'], 'es': ['London']}",,[],"{'Q403': ('', '')}",44.808495,20.463161 +Q985210,London Victoria station,"['Q55488', 'Q55485', 'Q7886778']","{'en': ['Victoria station', 'Victoria railway station', 'Victoria Railway Station The Former London, Chatham And Dover Railway Station Including Train Shed', 'London Victoria station'], 'es': ['estación Victoria', 'London Victoria', 'estacion de Victoria', 'estacion Victoria', 'estación de Victoria'], 'ro': ['gara London Victoria', 'gara Victoria', 'gara Londra Victoria'], 'it': ['stazione di Victoria', 'stazione di Londra Victoria'], 'de': ['London Victoria Station'], 'cy': ['Gorsaf reilffordd Victoria Llundain'], 'fr': ['gare de Londres Victoria'], 'nl': ['Station London Victoria'], 'pl': 
['Victoria Station'], 'pt': ['Estação Victoria'], 'en-ca': ['London Victoria station'], 'en-gb': ['London Victoria station'], 'uk': ['Лондон-Вікторія'], 'tr': ['Victoria İstasyonu']}",,['Q19186'],"{'Q145': ('', '')}",51.495005,-0.143577 +Q219867,London King's Cross railway station,"['Q55488', 'Q55485', 'Q22808404']","{'fr': ['gare de King’s Cross', ""gare de Londres-King's Cross""], 'es': [""Estacion de King's Cross St Pancras"", 'Estacion de Kings Cross', 'Estación de Kings Cross St. Pancras', 'King Cross', 'Kings Cross', 'Estacion de Kings Cross St. Pancras', ""Estación de King's Cross"", 'Estacion de Kings Cross St Pancras', ""King's Cross"", ""Estacion de King's Cross St. Pancras"", ""Estación de King's Cross St. Pancras"", 'Estación de Kings Cross'], 'it': [""Stazione di King's Cross"", ""stazione di Londra King's Cross""], 'de': [""King's Cross"", ""Bahnhof King's Cross"", 'King’s Cross', 'King’s Cross Station', ""King's Cross Station"", 'Bahnhof Kings Cross', 'London King’s Cross Station', 'Bahnhof King’s Cross'], 'nl': [""Station King's Cross"", 'Station King’s Cross', ""King's Cross Station"", ""Station London King's Cross""], 'pt': ['Plataforma Nove e Meia', ""Estação King's Cross"", 'Plataforma Nove e Três Quartos', 'King´s Cross', ""King's Cross"", ""London King's Cross"", ""Estação de King's Cross""], 'en': [""King's Cross Railway Station"", ""King's Cross station"", ""London King's Cross railway station""], 'pl': [""King's Cross Station""], 'uk': ['Кінгс-Кросс'], 'cy': [""Gorsaf reilffordd King's Cross Llundain""], 'ro': [""gara King's Cross""], 'tr': [""King's Cross Tren İstasyonu""], 'en-ca': [""London King's Cross railway station""], 'en-gb': [""London King's Cross railway station""]}","[""King's Cross station""]",['Q19186'],"{'Q145': ('', '')}",51.530889,-0.123306 +Q795678,Waterloo International railway station,"['Q55488', 'Q55485']","{'en': ['Waterloo International station', 'Waterloo International railway station'], 'de': ['London Waterloo 
Station'], 'nl': ['station Waterloo International']}",,['Q67443130'],"{'Q145': ('', '')}",51.502972,-0.114808 +Q7242790,Pride London,"['Q51404', 'Q11483816', 'Q132241']","{'en': ['Pride in London', 'Pride London'], 'tr': ['Pride London'], 'it': ['Pride London'], 'uk': ['Лондонський прайд'], 'es': ['Orgullo de Londres']}",,['Q19186'],"{'Q145': ('', '')}",51.518334,-0.14401 +Q216185,Charing Cross,"['Q4989906', 'Q2755753']","{'pl': ['Charing Cross'], 'fr': ['Charing Cross'], 'es': ['Charing Cross'], 'it': ['Charing Cross'], 'de': ['Charing Cross'], 'en': ['Charing Cross'], 'nl': ['Charing Cross'], 'pt': ['Charing Cross'], 'cy': ['Charing Cross'], 'en-ca': ['Charing Cross'], 'en-gb': ['Charing Cross'], 'ga': ['Charing Cross'], 'uk': ['Чарінг-Кросс'], 'tr': ['Charing Cross'], 'sco': ['Charing Cross']}",,['Q19186'],"{'Q145': ('', '')}",51.5073,-0.12755 +Q2018322,Old Compton Street,['Q79007'],"{'cy': ['Old Compton Street'], 'en': ['Old Compton Street'], 'de': ['Old Compton Street'], 'nl': ['Old Compton Street'], 'en-ca': ['Old Compton Street'], 'en-gb': ['Old Compton Street'], 'fr': ['Old Compton Street'], 'uk': ['Олд Комптон стріт']}",,['Q19186'],"{'Q145': ('', '')}",51.51326,-0.13128 +Q720102,St Pancras railway station,"['Q55488', 'Q2298537', 'Q55485', 'Q1402443']","{'pl': ['St Pancras Station', 'St Pancras International'], 'fr': ['gare de Londres-Saint-Pancras', 'gare de Saint-Pancras', 'St Pancras International'], 'es': ['Estacion de St Pancras', 'Estacion de St. Pancras', 'St Prancas', 'Estación de St Pancras', 'Estación de St. Pancras'], 'it': ['stazione di St Pancras', 'stazione di Londra St. Pancras', 'stazione di St. Pancras', 'stazione di Londra Saint Pancras', 'stazione di Saint Pancras', 'stazione di Londra St Pancras'], 'cy': ['St Pancras', 'Gorsaf reilffordd St Pancras', 'Gorsaf reilffordd St Pancras Llundain'], 'de': ['Bahnhof St. 
Pancras', 'London St Pancras', 'St Pancras Station', 'Bahnhof St Pancras'], 'en': ['St Pancras station', 'London St Pancras', 'St Pancras International', 'London St Pancras International', 'St Pancras railway station'], 'nl': ['St. Pancras International', 'St Pancras International', 'Station London St. Pancras', 'Station London St Pancras', 'station London St Pancras International'], 'uk': ['Сент-Панкрас'], 'tr': ['St Pancras Uluslararası Tren İstasyonu'], 'ro': ['gara St Pancras'], 'pt': ['Estação St Pancras']}",['St Pancras railway station'],['Q19186'],"{'Q145': ('', '')}",51.53,-0.125278 +Q23306,Greater London,['Q180673'],"{'de': ['Groß-London', 'Großraum London', 'Greater London'], 'pt': ['Região de Londres', 'Grande Londres'], 'en': ['London Region', 'Greater London'], 'en-gb': ['Greater London'], 'en-ca': ['Greater London'], 'cy': ['Llundain Fawr'], 'es': ['Gran Londres'], 'fr': ['Grand Londres'], 'ga': ['Londain Mhór'], 'it': ['Grande Londra'], 'nl': ['Groot-Londen'], 'pl': ['Wielki Londyn'], 'ro': ['Londra Mare'], 'sco': ['Greater Lunnon'], 'uk': ['Великий Лондон'], 'tr': ['Büyük Londra']}",['Greater London'],[],"{'Q145': ('', '')}",51.5,-0.083333 +Q42182,Buckingham Palace,"['Q53536964', 'Q2087181', 'Q570116', 'Q7328910', 'Q16884952']","{'pl': ['Buckingham Palace', 'Pałac Buckingham'], 'es': ['Buckingham Palace', 'Palacio de Buckingham'], 'pt': ['Buckingham House', 'Buckingham Palace', 'Palácio de Buckingham'], 'fr': ['Buckingham Palace', 'palais de Buckingham'], 'it': ['Buckingham House', 'Bukingam palace', 'Palazzo Di Buckingham', 'Buckingham Palace', 'Palazzo di Buckingham'], 'de': ['Buckinghampalast', 'Buckingham Palast', 'Buckingham-Palast', 'Buckingham Palace'], 'cy': ['Buckingham Palace', 'Palas Bycingam', 'Palas Buckingham'], 'en': ['Buckingham House', 'Buck House', 'Buckingham Palace'], 'ro': ['Casa de Buckingham', 'Buck House', 'Buckingham House', 'Palatul Buckingham'], 'ga': ['Pálás Buckingham'], 'nl': ['Buckingham Palace'], 'tr': ['Buckingham 
Sarayı'], 'uk': ['Букінгемський палац'], 'en-ca': ['Buckingham Palace'], 'en-gb': ['Buckingham Palace'], 'sco': ['Buckingham Palace'], 'gd': ['Lùchairt Buckingham']}",['Buckingham Palace'],['Q19186'],"{'Q145': ('', '')}",51.501,-0.142 +Q1449564,London station (Ontario),['Q55488'],"{'en': ['London, Ontario railway station', 'London station', 'London railway station', 'VIA Rail London station', 'London station (Ontario)'], 'fr': ['gare de London', 'London'], 'it': ['stazione di London'], 'es': ['Estación de London']}",,[],"{'Q16': ('', '')}",42.9813,-81.2467 +Q733210,basketball at the 1948 Summer Olympics,['Q26132862'],"{'fr': ['Basket-ball aux jeux Olympiques de 1948', ""Basket-ball aux jeux Olympiques d'été 1948"", ""Basket-ball aux jeux Olympiques d'ete de 1948"", ""basket-ball aux Jeux olympiques d'été de 1948""], 'ro': ['Baschet la Jocurile Olimpice din 1948', 'Baschet la Jocurile Olimpice de vară din 1948'], 'nl': ['Olympische Zomerspelen 1948/Basketbal', 'basketbal op de Olympische Zomerspelen 1948'], 'pt': ['Basquetebol nos Jogos Olímpicos de Verão de 1948'], 'pl': ['Koszykówka na Letnich Igrzyskach Olimpijskich 1948'], 'en': ['basketball at the 1948 Summer Olympics'], 'es': ['Anexo:Baloncesto en los Juegos Olímpicos de Londres 1948'], 'tr': [""1948 Yaz Olimpiyatları'nda basketbol""], 'it': ['Pallacanestro ai Giochi della XIV Olimpiade'], 'de': ['Olympische Sommerspiele 1948/Basketball'], 'ga': ['cispheil ag Cluichí Oilimpeacha an tSamhraidh 1948']}",,['Q19186'],"{'Q145': ('', '')}",51.576256,-0.097697 +Q14710970,London,['Q17343829'],"{'en': ['London, Texas', 'London, TX', 'London'], 'fr': ['London'], 'it': ['London']}",,[],"{'Q30': ('', '')}",30.6769,-99.5764 +Q2422792,London commuter belt,['Q1907114'],"{'fr': ['London commuter belt', 'Aire métropolitaine de Londres', 'Commuter Belt', 'Aire urbaine de Londres'], 'es': ['London commuter belt', 'Area metropolitana sureste de Inglaterra', 'Area metropolitana de Londres', 'Área metropolitana sureste de 
Inglaterra', 'Área metropolitana de Londres'], 'en': ['London Metropolitan Region', 'London Metropolitan Area', 'London commuter belt'], 'nl': ['Metropoolregio van Londen', 'London commuter belt'], 'pt': ['Área metropolitana de Londres'], 'sco': ['Lunnon commuter belt'], 'it': ['area metropolitana di Londra']}",,['Q19186'],"{'Q145': ('', '')}",51.5073,-0.1277 +Q1001456,London,"['Q1093829', 'Q62049']","{'tr': ['London, Ohio', 'London'], 'en': ['London, Ohio', 'London, OH', 'London'], 'es': ['London (Ohio)', 'London'], 'pt': ['London'], 'pl': ['London'], 'fr': ['London'], 'de': ['London'], 'nl': ['London'], 'en-ca': ['London, Ohio'], 'en-gb': ['London'], 'it': ['London'], 'uk': ['Лондон'], 'cy': ['London, Ohio'], 'ga': ['London']}",,[],"{'Q30': ('', '')}",39.887466,-83.445041 +Q503516,Laurel County,['Q13410447'],"{'en': ['Laurel County, Kentucky', 'Laurel County, KY', 'Laurel County'], 'cy': ['Laurel County, Kentucky', 'Laurel County'], 'pt': ['Condado de Laurel'], 'pl': ['Hrabstwo Laurel'], 'fr': ['comté de Laurel'], 'es': ['Condado de Laurel'], 'uk': ['Лорел'], 'it': ['contea di Laurel'], 'de': ['Laurel County'], 'nl': ['Laurel County'], 'gd': ['Laurel County'], 'ro': ['Comitatul Laurel, Kentucky'], 'tr': ['Laurel County'], 'ga': ['Contae Laurel']}",['Laurel County'],[],"{'Q30': ('', '')}",37.11067,-84.1178 +Q8982,London City Airport,"['Q644371', 'Q1248784', 'Q94993988']","{'de': ['London City Airport', 'Flughafen London City'], 'fr': ['aérodrome de Royaume Uni', 'aéroport de Londres City'], 'nl': ['London City Airport', 'Luchthaven Londen City'], 'en': ['London City Airport'], 'it': ['aeroporto di Londra-City'], 'es': ['Aeropuerto de la Ciudad de Londres'], 'pl': ['Port lotniczy Londyn-City'], 'pt': ['Aeroporto da Cidade de Londres'], 'ro': ['Aeroportul London City'], 'sco': ['London City Airport'], 'tr': ['Londra Şehir Havalimanı'], 'uk': ['Лондон-Сіті'], 'cy': ['Maes Awyr Dinas Llundain']}",,['Q67442940'],"{'Q145': ('', '')}",51.505278,0.055278 
+Q22059065,London,['Q17343829'],"{'en': ['London, Indiana', 'London, IN', 'London']}",,[],"{'Q30': ('', '')}",39.625556,-85.920278 +Q8712,London Luton Airport,"['Q644371', 'Q94993988']","{'en': ['Luton', 'Luton Airport', 'London Luton Airport'], 'fr': ['Londres-Luton', 'aérodrome de Royaume Uni', 'Luton', 'aéroport de Londres Luton'], 'it': ['Londra-Luton', 'Aeroporto di Londra-Luton'], 'uk': ['Лондонський аеропорт Лутон', 'Лутон'], 'de': ['Flughafen Luton', 'London Luton Airport'], 'nl': ['London Luton Airport', 'Luchthaven Londen Luton'], 'en-gb': ['London Luton Airport'], 'es': ['Aeropuerto de Londres-Luton'], 'pl': ['Port lotniczy Londyn-Luton'], 'pt': ['Aeroporto de Londres Luton'], 'tr': ['Londra Luton Havalimanı'], 'ro': ['Aeroportul Luton'], 'cy': ['Maes Awyr Luton']}",,['Q67387552'],"{'Q145': ('', '')}",51.874722,-0.368333 +Q20657974,London,['Q17343829'],"{'en': ['London, Minnesota', 'London, MN', 'London'], 'es': ['Londres, Minnesota', 'London, Minnesota', 'London']}",,[],"{'Q30': ('', '')}",43.526111,-93.062778 +Q565521,Clarence House,"['Q53536964', 'Q1802963']","{'ro': ['Clarence House', 'Casa Clarence'], 'uk': ['Кларенс-хаус', 'Кларенс-гаус'], 'cy': ['Tŷ Clarence', 'Clarence House'], 'pt': ['Clarence House'], 'pl': ['Clarence House'], 'fr': ['Clarence House'], 'en': ['Clarence House'], 'es': ['Clarence House'], 'it': ['Clarence House'], 'de': ['Clarence House'], 'nl': ['Clarence House'], 'tr': ['Clarence House'], 'en-ca': ['Clarence House'], 'en-gb': ['Clarence House'], 'ga': ['Teach Clarence']}",['Clarence House'],['Q19186'],"{'Q145': ('', '')}",51.504,-0.1385 +Q238587,National Portrait Gallery,"['Q207694', 'Q17431399', 'Q3343298']","{'en': ['Great Britain National Portrait Gallery', 'London National Portrait Gallery', 'National Portrait Gallery (London)', 'National Portrait Gallery London', 'NPG London', 'National Portrait Gallery'], 'cy': ['Oriel Bortreadau Genedlaethol', 'Galeri Genedlaethol o Bortreadau', 'yr Oriel Bortreadau Genedlaethol'], 'de': 
['National Portrait Gallery'], 'es': ['National Portrait Gallery'], 'fr': ['National Portrait Gallery'], 'it': ['National Portrait Gallery'], 'nl': ['National Portrait Gallery'], 'pl': ['National Portrait Gallery'], 'pt': ['National Portrait Gallery'], 'ro': ['National Portrait Gallery'], 'uk': ['Національна портретна галерея (Лондон)'], 'ga': ['Gailearaí na bPortráidí Náisiúnta'], 'en-gb': ['National Portrait Gallery'], 'tr': ['Ulusal Portre Galerisi']}",['National Portrait Gallery'],['Q19186'],"{'Q145': ('', '')}",51.5094,-0.1281 +Q2716505,Stamford Hill,['Q2755753'],"{'nl': ['Stamford Hill'], 'en': ['Stamford Hill'], 'fr': ['Stamford Hill'], 'en-gb': ['Stamford Hill'], 'de': ['Stamford Hill'], 'ga': ['Stamford Hill'], 'it': ['Stamford Hill']}",['Stamford Hill'],['Q19186'],"{'Q145': ('', '')}",51.5705,-0.0727 +Q927198,Londinium,['Q2202509'],"{'fr': ['Londres romain', 'Londinium'], 'en': ['Roman London', 'Londinium'], 'uk': ['Лондиніум', 'Лондініум', 'Лондиній'], 'pt': ['Londinium'], 'pl': ['Londinium'], 'es': ['Londinium'], 'de': ['Londinium'], 'nl': ['Londinium'], 'it': ['Londinium'], 'en-ca': ['Londinium'], 'en-gb': ['Londinium'], 'ro': ['Londinium'], 'tr': ['Londinium'], 'cy': ['Londinium']}",,['Q19186'],"{'Q145': ('', '')}",51.514217,-0.088455 +Q122744,Maida Vale,['Q2755753'],"{'fr': ['Maida Vale'], 'en': ['Maida Vale'], 'ga': ['Maida Vale'], 'nl': ['Maida Vale'], 'it': ['Maida Vale'], 'sco': ['Maida Vale'], 'es': ['Maida Vale'], 'pt': ['Maida Vale'], 'pl': ['Maida Vale'], 'uk': ['Мейда-Вейл'], 'ro': ['Maida Vale'], 'de': ['Maida Vale'], 'en-ca': ['Maida Vale'], 'en-gb': ['Maida Vale']}",,['Q19186'],"{'Q145': ('1922', ''), 'Q174193': ('1801', '1922')}",51.5274,-0.1899 +Q123738,Hyde Park,['Q22698'],"{'es': ['Hyde Park Londres', 'Hyde Park'], 'it': ['Hyde Park di Londra', 'Hyde Park'], 'en': ['Hyde Park, London', 'Hyde Park'], 'pl': ['Hyde Park'], 'fr': ['Hyde Park'], 'de': ['Hyde Park'], 'ga': ['Hyde Park'], 'nl': ['Hyde Park'], 'pt': ['Hyde Park'], 'tr': 
['Hyde Park'], 'ro': ['Hyde Park'], 'uk': ['Гайд-парк'], 'en-ca': ['Hyde Park, London'], 'en-gb': ['Hyde Park'], 'cy': ['Hyde Park']}",['Hyde Park'],['Q19186'],"{'Q145': ('', '')}",51.508611,-0.163611 +Q8703,London Gatwick Airport,"['Q644371', 'Q94993988']","{'en': ['Gatwick', 'Gatwick Airport', 'London Gatwick', 'London Gatwick Airport'], 'de': ['Flughafen London-Gatwick', 'Flughafen Gatwick', 'Flughafen London Gatwick'], 'it': ['Aeroporto di Gatwick', 'Gatwick', 'Londra-Gatwick', 'Aeroporto di Londra-Gatwick'], 'ro': ['Londra Gatwick', 'aeroportul Londra Gatwick', 'Aeroportul Londra Gatwick'], 'fr': ['Londres-Gatwick', 'aéroport de Londres Gatwick'], 'nl': ['London Gatwick Airport', 'Luchthaven Londen Gatwick'], 'es': ['Londres-Gatwick', 'Aeropuerto de Londres-Gatwick'], 'cy': ['Maes Awyr Gatwick'], 'ga': ['Aerfort Londain-Gatwick'], 'pl': ['Port lotniczy Londyn-Gatwick'], 'pt': ['Aeroporto de Londres Gatwick'], 'uk': ['Аеропорт Гатвік'], 'tr': ['Londra Gatwick Havalimanı'], 'gd': ['Port-adhair Gatwick']}",,['Q67443130'],"{'Q145': ('', '')}",51.147222,-0.190278 +Q15179170,Alexandra Palace transmitting station,['Q5367899'],"{'en-gb': ['Alexandra Palace television station', 'Alexandra Palace transmitting station'], 'en': ['Alexandra Palace television station', 'Alexandra Palace transmitting station']}",,['Q19186'],"{'Q145': ('', '')}",51.594444,-0.129167 +Q10818,7 July 2005 London bombings,['Q217327'],"{'en': ['7/7 London Bombings', '7 July London Bombings', 'July 7, 2005 London Bombings', 'July 7 London Bombings', 'Coordinated terrorist attack hits London', '7 July 2005 London bombings'], 'en-gb': ['7/7 London Bombings', '7 July London Bombings', '7 July 2005 London bombings'], 'fr': ['Les transports londoniens touchés par des attentats', 'attentats du 7 juillet 2005 à Londres'], 'pt': ['Londres é vítima de ataque terrorista', 'Atentados de 7 de julho de 2005 em Londres'], 'de': ['Terroranschläge in London', 'Terroranschläge am 7. 
Juli 2005 in London'], 'it': ['attentato di Londra del 7 luglio 2005', 'attentati di Londra del 7 luglio 2005'], 'cy': ['Ffrwydradau Llundain 7 Gorffennaf 2005'], 'en-ca': ['7 July 2005 London bombings'], 'es': ['atentados del 7 de julio de 2005 en Londres'], 'nl': ['terroristische aanslagen in Londen van 7 juli 2005'], 'pl': ['Zamach w Londynie'], 'ro': ['Atentatele din 7 iulie 2005 de la Londra'], 'uk': ['Вибухи у Лондоні 7 липня 2005'], 'tr': ['7 Temmuz 2005 Londra saldırıları'], 'ga': ['7 Iúil 2005 Buamáil Londan']}",,['Q19186'],"{'Q145': ('', '')}",51.504872,-0.07857 +Q1359589,West End theatre,['Q11635'],"{'es': ['Teatro del West End', 'Teatro de West End', 'Teatros de West End', 'Teatros del West End'], 'en': ['West End', 'West End theatre'], 'pl': ['West End'], 'fr': ['West End theatre'], 'it': ['Teatro del West End'], 'en-ca': ['West End theatre'], 'en-gb': ['West End theatre'], 'de': ['West End Theatre'], 'ro': ['West End'], 'tr': ['West End tiyatrosu']}",,['Q19186'],"{'Q145': ('', '')}",51.511389,-0.128056 +Q649419,Marylebone station,"['Q55488', 'Q55485']","{'en': ['London Marylebone', 'Marylebone station'], 'ro': ['Londra Marylebone', 'gara Londra Marylebone', 'gara Marylebone'], 'it': ['stazione di Marylebone', 'stazione di Londra Marylebone'], 'pt': ['London Marylebone', 'Estação Marylebone'], 'fr': ['gare de Marylebone', 'London Marylebone'], 'cy': ['Gorsaf reilffordd Marylebone Llundain'], 'de': ['Bahnhof Marylebone'], 'nl': ['Station London Marylebone'], 'pl': ['Marylebone'], 'tr': ['Marylebone İstasyonu'], 'es': ['Estación de Marylebone'], 'uk': ['Мерілебон (станція)']}",,['Q19186'],"{'Q145': ('', '')}",51.522222,-0.163056 +Q15242653,London Museum,['Q33506'],"{'en': ['London Museum'], 'it': ['London Museum'], 'de': ['London Museum']}",,['Q19186'],"{'Q145': ('', '')}",51.5052,-0.188 +Q20075,London Underground,"['Q5503', 'Q1268865']","{'en-gb': ['the Underground', 'the Tube', 'London Underground'], 'en': ['the Underground', 'the Tube', 'London 
Underground Limited', 'London Underground'], 'es': ['London Underground', 'Underground', 'Tube', 'Metro de Londres'], 'pl': ['London Underground', 'the Tube', 'the Underground', 'Metro w Londynie'], 'cy': ['Underground Llundain', 'Rheilffordd Danddaearol Llundain'], 'it': ['the Tube', 'London Underground', 'metropolitana di Londra'], 'nl': ['London Underground', 'metro van Londen'], 'de': ['London Underground'], 'fr': ['métro de Londres'], 'ga': ['London Underground'], 'pt': ['Metropolitano de Londres'], 'ro': ['Metroul din Londra'], 'tr': ['Londra metrosu'], 'uk': ['Лондонський метрополітен'], 'en-ca': ['London Underground']}",,"['Q67443130', 'Q19186', 'Q67285329', 'Q67442940', 'Q67532100']","{'Q145': ('', '')}",51.492778,-0.100833 +Q6669738,London,['Q17343829'],"{'en': ['London, Wisconsin', 'London, WI', 'London'], 'es': ['Londres, Wisconsin', 'London (Wisconsin)'], 'fr': ['London']}",,[],"{'Q30': ('', '')}",43.0478,-89.0128 +Q756819,Strand,['Q79007'],"{'nl': ['The Strand', 'Strand'], 'en': ['the Strand', 'Strand, London', 'Strand'], 'pt': ['Strand'], 'fr': ['The Strand'], 'es': ['Strand'], 'uk': ['Стренд'], 'it': ['Strand'], 'cy': ['Y Strand'], 'de': ['Strand'], 'en-ca': ['Strand'], 'en-gb': ['Strand'], 'ga': ['Strand'], 'tr': ['Strand, Londra'], 'pl': ['Strand (Londyn)']}",['Strand'],['Q19186'],"{'Q145': ('', '')}",51.5114,-0.119 +Q2354215,Central London,['Q82794'],"{'fr': ['Central London'], 'en': ['Central London'], 'it': ['Central London'], 'en-ca': ['Central London'], 'en-gb': ['Central London'], 'nl': ['Centraal Londen'], 'de': ['Central London'], 'es': ['Centro de Londres'], 'cy': ['canol Llundain'], 'pt': ['Centro de Londres'], 'uk': ['Центральний Лондон']}",,['Q67479626'],"{'Q145': ('', '')}",51.5073,0.12755 +Q7443327,Second Great Fire of London,"['Q2380335', 'Q838718']","{'en': ['Second Great Fire of London'], 'uk': ['Друга велика лондонська пожежа']}",,[],"{'Q145': ('', '')}",51.5157,-0.0921 +Q123885,Royal Society,"['Q414147', 'Q2085381', 'Q45400320', 
'Q955824', 'Q1966910']","{'pl': ['Towarzystwo Królewskie', 'The Royal Society of London for Improving Natural Knowledge', 'The Royal Society', 'Royal Society'], 'nl': ['Royal Society of London', 'Royal Society of London for the Improvement of Natural Knowledge', 'Royal Society'], 'pt': ['Royal Society of London', 'Real Sociedade de Londres', 'The Royal Society', 'Sociedade Real de Londres', 'Royal Society'], 'tr': ['The Royal Society of London for the Improvement of Natural Knowledge', 'Kraliyet Cemiyeti', 'Royal Society of London for the Improvement of Natural Knowledge', 'Royal Society'], 'uk': ['Британське королівське товариство', 'Лондонського королівського товариства', 'Королівське товариство', 'Королівське наукове товариство', 'Лондонське королівське наукове товариство', 'Лондонське королівське товариство'], 'it': ['Fellow of the Royal Society', 'Royal Society'], 'ro': ['Fellow of the Royal Society', 'Societatea Regală din Londra'], 'en': ['The Royal Society of London for Improving Natural Knowledge', 'Royal Society of London', 'The President, Council, and Fellows of the Royal Society of London for Improving Natural Knowledge', 'The Royal Society, UK', 'Royal Society'], 'ga': ['Cumann Ríoga', 'An Cumann Ríoga'], 'es': ['Royal Society'], 'de': ['Royal Society'], 'cy': ['y Gymdeithas Frenhinol'], 'fr': ['Royal Society'], 'en-gb': ['Royal Society'], 'gd': ['An Comann Rìoghail'], 'kw': ['Kowethas Riel']}",,['Q19186'],"{'Q145': ('', '')}",51.506111,-0.132222 +Q55018,Royal Opera House,"['Q153562', 'Q24354', 'Q3469910']","{'en': ['Covent Garden', 'Royal Italian Opera', 'ROH Covent Garden', 'Covent Garden Opera', 'Royal Opera House, Covent Garden', 'Royal Opera House'], 'de': ['The Royal Opera House', 'Royal Opera House'], 'pl': ['Royal Opera House', 'English National Opera', 'Covent Garden Theatre'], 'nl': ['Royal Opera House', 'Royal Opera House Covent Garden'], 'en-ca': ['Royal Opera House'], 'en-gb': ['Royal Opera House'], 'es': ['Royal Opera House'], 'fr': 
['Royal Opera House'], 'it': ['Royal Opera House'], 'pt': ['Royal Opera House'], 'tr': ['Royal Opera House'], 'uk': ['Королівський театр Ковент-Гарден'], 'cy': ['Tŷ Opera Brenhinol'], 'ro': ['Royal Opera House']}",['Royal Opera House'],['Q19186'],"{'Q145': ('', '')}",51.513056,-0.1225 +Q130206,London Bridge,"['Q537127', 'Q3397519', 'Q158438', 'Q1735471', 'Q1223230']","{'en': ['London Bridge'], 'en-ca': ['London Bridge'], 'en-gb': ['London Bridge'], 'de': ['London Bridge'], 'es': ['London Bridge'], 'fr': ['pont de Londres'], 'ga': ['Droichead Londan'], 'it': ['London Bridge'], 'nl': ['London Bridge'], 'pl': ['London Bridge'], 'pt': ['Ponte de Londres'], 'ro': ['Podul Londrei'], 'tr': ['Londra Köprüsü'], 'uk': ['Лондонський міст'], 'sco': ['London Bridge'], 'cy': ['Pont Llundain']}",,"['Q19186', 'Q67443130']","{'Q21': ('', '')}",51.508056,-0.087778 +Q4642035,64 Baker Street,['Q41176'],{'en': ['64 Baker Street']},,['Q19186'],"{'Q145': ('', '')}",51.5191,-0.156 +Q729177,Cleopatra's Needle,"['Q170980', 'Q570116']","{'en': [""Cleopatra's Needle, London"", ""Cleopatra's Needle""], 'fr': ['Aiguille de Cléopâtre'], 'de': ['Nadeln der Kleopatra'], 'es': ['Agujas de Cleopatra'], 'it': ['Ago di Cleopatra'], 'nl': ['Naald van Cleopatra'], 'pt': ['Agulhas de Cleópatra'], 'uk': ['Голка Клеопатри (Лондон)'], 'pl': ['Igła Kleopatry']}","[""Cleopatra's Needle""]",['Q19186'],"{'Q145': ('', '')}",51.508503,-0.120296 +Q1399178,Fazl Mosque,['Q32815'],"{'en': ['The London Mosque', 'Fazl Mosque'], 'de': ['Fazl-Moschee'], 'es': ['Mezquita Fazl'], 'fr': ['mosquée Fazl'], 'en-gb': ['Fazl Mosque']}",,['Q67443130'],"{'Q145': ('', '')}",51.4511,-0.2075 +Q5645763,Hammersmith bus station,['Q494829'],{'en': ['Hammersmith bus station']},,['Q19186'],"{'Q145': ('', '')}",51.4921,-0.224 +Q194209,basketball at the 2012 Summer Olympics,['Q26132862'],"{'es': ['Anexo:Baloncesto en los Juegos Olímpicos de 2012', 'Anexo:Baloncesto en los Juegos Olímpicos de Londres 2012'], 'ro': ['Baschet la Jocurile 
Olimpice din 2012', 'Baschet la Jocurile Olimpice de vară din 2012'], 'pl': ['Koszykówka na Letnich Igrzyskach Olimpijskich 2012'], 'fr': [""basket-ball aux Jeux olympiques d'été de 2012""], 'it': ['Pallacanestro ai Giochi della XXX Olimpiade'], 'nl': ['basketbal op de Olympische Zomerspelen 2012'], 'pt': ['Basquetebol nos Jogos Olímpicos de Verão de 2012'], 'en': ['basketball at the 2012 Summer Olympics'], 'tr': [""2012 Yaz Olimpiyatları'nda basketbol""], 'uk': ['Баскетбол на літніх Олімпійських іграх 2012'], 'de': ['Olympische Sommerspiele 2012/Basketball'], 'ga': ['cispheil ag Cluichí Oilimpeacha an tSamhraidh 2012']}",,[],"{'Q145': ('', '')}",51.5486,-0.0139 +Q801124,Liverpool Street station,"['Q55485', 'Q55488', 'Q1793804']","{'nl': ['Liverpool Street', 'Liverpool Street station', 'Liverpool Street Station', 'Station Liverpool Street', 'Station London Liverpool Street'], 'de': ['Liverpool Street Station', 'Bahnhof Liverpool Street'], 'it': ['Stazione di Liverpool Street', 'stazione di Londra Liverpool Street'], 'fr': ['gare de Liverpool Street', 'Liverpool Street'], 'cy': ['Gorsaf reilffordd Liverpool Street', 'Gorsaf reilffordd Liverpool Street Llundain'], 'es': ['Estacion de Liverpool Street', 'London Liverpool Street', 'Liverpool Street', 'Estación de Liverpool Street'], 'pl': ['London Liverpool Street', 'Liverpool Street Station'], 'en': ['Liverpool Street railway station', 'Bishopsgate station', 'London Liverpool Street', 'Liverpool Street Overground station', 'Liverpool Street station'], 'en-gb': ['Liverpool Street railway station', 'London Liverpool Street', 'Bishopsgate station', 'Liverpool Street Underground station', 'Liverpool Street tube station', 'Liverpool Street station'], 'ro': ['gara Londra Liverpool Street', 'Londra Liverpool Street', 'gara Liverpool Street'], 'ga': ['Stáisiún Sráid Learpholl'], 'en-ca': ['Liverpool Street station'], 'tr': ['Liverpool Street İstasyonu'], 'uk': ['Ліверпуль-Стріт'], 'pt': ['Estação Liverpool 
Street']}",,['Q19186'],"{'Q145': ('', '')}",51.5186,-0.0813 +Q7737135,The Goldsmiths' Company Assay Office,,"{'en': [""The Goldsmiths' Company Assay Office""]}",,[],{},51.5157,-0.0959 +Q4834838,BBC Radio London,['Q14350'],"{'en': ['BBC London 94.9', 'Radio London', 'BBC Radio London'], 'en-gb': ['Radio London', 'BBC London 94.9', 'BBC Radio London'], 'pl': ['BBC London 94.9'], 'it': ['BBC London 94.9'], 'fr': ['BBC London 94.9']}",,[],"{'Q145': ('', '')}",51.5185,-0.1431 +Q17509255,Chiswell Street,['Q79007'],"{'en': ['Chiswell Street'], 'nl': ['Chiswell Street'], 'fr': ['Chiswell Street']}",,['Q19186'],"{'Q145': ('', '')}",51.5207,-0.089944 +Q951830,Royal Mint,"['Q464780', 'Q270791']","{'en': ['The Royal Mint', 'The Royal Mint (UK)', 'Royal Mint'], 'cy': ['Bathdy Brenhinol', 'y Bathdy Brenhinol'], 'nl': ['The Royal Mint (UK)', 'Royal Mint'], 'fr': ['Royal Mint'], 'de': ['Royal Mint'], 'it': ['Royal Mint'], 'es': ['Royal Mint'], 'pt': ['Royal Mint (Reino Unido)']}",,[],"{'Q145': ('', '')}",51.555,-3.387 +Q800753,Fenchurch Street railway station,"['Q55488', 'Q55485']","{'de': ['Fenchurch Street Station', 'Bahnhof Fenchurch Street'], 'nl': ['Station Fenchurch Street', 'Station London Fenchurch Street'], 'pt': ['Fenchurch Street (Londres)', 'Estação de Fenchurch Street', 'Fenchurch Street'], 'it': ['Stazione di Fenchurch Street', 'stazione di Londra Fenchurch Street'], 'en': ['London Fenchurch Street', 'Fenchurch Street', 'Fenchurch Street railway station'], 'fr': ['gare de Fenchurch Street'], 'en-ca': ['Fenchurch Street railway station'], 'en-gb': ['Fenchurch Street railway station'], 'es': ['Estación de Fenchurch Street'], 'uk': ['Фенчерч-стрит (станція)']}",,['Q19186'],"{'Q145': ('', '')}",51.511667,-0.078611 +Q6671078,London bid for the 2012 Summer Olympics,['Q938381'],"{'en': ['London bid for the 2012 Summer Olympics'], 'es': ['Candidatura de Londres a los Juegos Olímpicos de 2012']}",,[],"{'Q145': ('', '')}",51.54615,-0.01269 +Q186309,Madame 
Tussauds,['Q667018'],"{'pl': [""Madame Tussaud's"", 'Muzeum Figur Woskowych Madame Tussaud', 'Muzeum Figur Woskowych Madame Tussaud w Londynie'], 'fr': ['Musée Madame Tussauds', 'Madame Tussaud', 'Musée de Madame Tussauds', 'Madame Tussauds'], 'es': [""Madame Tussaud's Las Vegas"", 'Madame Tussauds Las Vegas', 'Madame Tussauds', 'Museo Madame Tussaud', 'Museo Madame Tussauds'], 'it': ['Madame Tussaud', ""Madame Tussaud's"", 'Madame Tussauds'], 'de': ['Madame Tussaud', ""Madame Tussaud's Wachsfigurenkabinett"", 'Madame Tussaud’s', ""Madame Tussaud's"", 'Madame Tussaud’s Waxwork Museum', 'Grosholtz', 'Philippe Curtius', ""Madame Tussaud's Waxwork Museum"", 'Madame Tussauds Wachsfigurenkabinett', 'Wachsfiguren-Kabinett', 'Tussaud', 'Wachsfigurenausstellung der Madame Tussand,', 'Madame Tussauds'], 'nl': ['Madame Tussaud', 'Madam Tussaud', 'Madame Tussauds'], 'pt': ['Madame Tussaud', 'Museu Madame Tussauds', 'Madame Tussauds'], 'tr': ['Madame Tussaud', 'Madamme Tussauds', 'Tussaud Müzesi', 'Madame Tussauds'], 'ro': [""Madame tussaud's"", 'Madame Tussauds'], 'en': ['Madame Tussauds'], 'uk': ['Музей мадам Тюссо'], 'ga': ['Madame Tussauds'], 'sco': ['Madame Tussauds']}",,['Q19186'],"{'Q145': ('', '')}",51.52279,-0.15517 +Q148349,Lambeth,"['Q2755753', 'Q149621']","{'es': ['Distrito de Lambeth', 'Lambeth'], 'nl': ['Lambeth'], 'en': ['Lambeth'], 'en-gb': ['Lambeth'], 'fr': ['Lambeth'], 'it': ['Lambeth'], 'de': ['Lambeth'], 'pl': ['Lambeth'], 'uk': ['Ламбет'], 'cy': ['Lambeth'], 'pt': ['Lambeth'], 'ga': ['Lambeth']}",,['Q67443130'],"{'Q145': ('', '')}",51.4903,-0.1193 +Q212883,diving at the 2012 Summer Olympics,['Q26132862'],"{'pl': ['Skoki do wody 2012', 'Skoki do wody na Letnich Igrzyskach Olimpijskich 2012'], 'fr': ['Plongeon aux Jeux olympiques de 2012', ""plongeon aux Jeux olympiques d'été de 2012""], 'ro': ['Sărituri în apă la Jocurile Olimpice din 2012', 'Sărituri natație la Jocurile Olimpice din 2012', 'Sărituri în apă la Jocurile Olimpice de vară din 2012'], 'es': 
['Anexo:Saltos en los Juegos Olímpicos de Londres 2012'], 'it': ['Tuffi ai Giochi della XXX Olimpiade'], 'en': ['diving at the 2012 Summer Olympics'], 'nl': ['schoonspringen op de Olympische Zomerspelen 2012'], 'pt': ['Saltos ornamentais nos Jogos Olímpicos de Verão de 2012'], 'tr': [""2012 Yaz Olimpiyatları'nda atlama""], 'uk': ['Стрибки у воду на літніх Олімпійських іграх 2012'], 'de': ['Olympische Sommerspiele 2012/Wasserspringen'], 'ga': ['tumadóireacht ag Cluichí Oilimpeacha an tSamhraidh 2012']}",,[],"{'Q145': ('', '')}",51.5402,-0.0106 +Q195436,Tate Britain,"['Q207694', 'Q17431399']","{'pl': ['Tate Gallery', 'Tate Britain'], 'es': ['Tate Gallery', 'Tate Britain'], 'de': ['Tate Gallery of British Art', 'National Gallery of British Art', 'Tate Gallery', 'Tate Britain'], 'fr': ['National Gallery of British Art', 'Tate Gallery', 'Tate Britain'], 'en': ['National Gallery of British Art', 'Tate Gallery', 'Tate Gallery of British Art', 'Tate Britain'], 'tr': ['Tate Britain'], 'it': ['Tate Britain'], 'cy': ['Tate Britain'], 'nl': ['Tate Britain'], 'uk': ['Тейт Британія'], 'en-gb': ['Tate Britain'], 'pt': ['Tate Modern']}",['Tate Britain'],['Q19186'],"{'Q145': ('', '')}",51.490833,-0.127222 +Q5038252,Cardboard City,['Q486972'],"{'en': ['Cardboard City'], 'nl': ['Cardboard City']}",,['Q67443130'],"{'Q145': ('', '')}",51.505,-0.113611 +Q743535,Chelsea,"['Q2755753', 'Q1115575']","{'en-ca': ['Chelsea, London', 'Chelsea'], 'en': ['Chelsea, London', 'Chelsea'], 'cy': ['Chelsea, London', 'Chelsea'], 'de': ['Chelsea, London', 'Chelsea'], 'en-gb': ['Chelsea, London', 'Chelsea'], 'es': ['Chelsea (Londres)', 'Chelsea'], 'fr': ['Chelsea, London', 'Chelsea'], 'ga': ['Chelsea, London', 'Chelsea'], 'gd': ['Chelsea, London', 'Chelsea'], 'it': ['Chelsea, London', 'Chelsea'], 'nl': ['Chelsea, London', 'Chelsea'], 'pl': ['Chelsea, London', 'Chelsea'], 'pt': ['Chelsea, London', 'Chelsea'], 'ro': ['Chelsea, London', 'Chelsea'], 'sco': ['Chelsea, London', 'Chelsea'], 'tr': ['Chelsea'], 
'uk': ['Челсі']}",,['Q19186'],"{'Q145': ('1922', ''), 'Q174193': ('1801', '1922'), 'Q161885': ('1707', '1800'), 'Q179876': ('', '1707')}",51.4875,-0.1684 +Q83609,Acton,"['Q3957', 'Q2755753', 'Q1115575']","{'en': ['Acton, London', 'Acton'], 'es': ['Acton (Londres)', 'Acton'], 'pl': ['Acton'], 'ro': ['Acton'], 'ga': ['Acton'], 'nl': ['Acton'], 'cy': ['Acton, Llundain'], 'en-gb': ['Acton'], 'fr': ['Acton'], 'tr': ['Acton, Londra'], 'uk': ['Ектон (Лондон)'], 'it': ['Acton']}",,['Q19186'],"{'Q145': ('', '')}",51.510519,-0.262661 +Q79348,London,['Q1093829'],"{'en': ['London, Arkansas', 'London, AR', 'London'], 'es': ['London, Arkansas', 'Londres, Arkansas', 'London'], 'pt': ['London'], 'pl': ['London'], 'nl': ['London'], 'fr': ['London'], 'it': ['London'], 'uk': ['Лондон'], 'cy': ['London, Arkansas'], 'de': ['London'], 'ga': ['London']}",,[],"{'Q30': ('', '')}",35.3258,-93.2367 +Q193196,University College London,"['Q15407956', 'Q4671277', 'Q38723', 'Q5341295']","{'fr': ['University College of London', 'University College London', 'University College', 'University College de Londres'], 'es': ['University College of London', 'University College London', 'Escuela Universitaria de Londres', 'University College de Londres'], 'it': ['University College of London', 'University College London', 'University College di Londra', 'University College'], 'de': ['University College in London', 'University College London'], 'pt': ['University College', 'University College London'], 'tr': ['University College', 'Londra Üniversitesi Akademisi', 'Londra Üniversitesi Koleji'], 'en': ['University College, London', 'London University', ""London's Global University"", 'University College London'], 'pl': ['University College London'], 'nl': ['University College London'], 'ro': ['University College London'], 'uk': ['Університетський коледж Лондона'], 'sco': ['University College London'], 'cy': ['Coleg Prifysgol Llundain'], 'en-gb': ['University College London'], 'gd': ['Colaiste Oilthigh 
Lunnainn'], 'ga': ['University College London'], 'kw': ['Kollji Pennskol Loundres']}",,['Q19186'],"{'Q145': ('', '')}",51.524722,-0.133611 +Q4801470,Arts Educational School,"['Q2418495', 'Q2143781']","{'en': ['ArtsEd', 'Arts Educational School'], 'en-ca': ['Arts Educational Schools, London'], 'en-gb': ['Arts Educational School'], 'cy': ['Arts Educational Schools, Llundain'], 'nl': ['Arts Educational School']}",,['Q19186'],"{'Q145': ('', '')}",51.4961,-0.2525 +Q220198,Zoological Society of London,"['Q748019', 'Q45400320', 'Q1966910']","{'es': ['Sociedad Zoologica de Londres', 'Sociedad Zoológica de Londres'], 'fr': ['Zoological Society of London', 'Société zoologique de Londres'], 'en': ['Zoological Society of London'], 'nl': ['Zoological Society of London'], 'pt': ['Sociedade Zoológica de Londres'], 'de': ['Zoological Society of London'], 'pl': ['Zoological Society of London'], 'it': ['Società Zoologica di Londra'], 'gd': ['Comann Ainmh-eòlas Lunnainn'], 'cy': ['Cymdeithas Swoleg Llundain'], 'uk': ['Зоологічне товариство Лондона'], 'tr': ['Londra Zooloji Topluluğu'], 'ga': ['Cumann Zó-eolaíochta Londan']}",,['Q19186'],"{'Q145': ('', '')}",51.5357,-0.1575 +Q124234,St James’s,['Q2755753'],"{'de': [""St James's"", 'St. James’s'], 'en': ['St. James’s', 'St James’s, London', 'St James’s'], 'it': [""St James's"", ""Saint James's"", ""St. James's""], 'fr': [""St. 
James's""], 'es': [""St James's""], 'ga': [""St James's""], 'nl': [""St James's""], 'cy': [""St James's""]}",,['Q19186'],"{'Q145': ('', '')}",51.5085,-0.133 +Q23298,Kent,['Q180673'],"{'de': ['Zeremonielle Grafschaft Kent', 'Kent'], 'en': ['Kent, England', 'Kent'], 'tr': ['Törensel Kent Kontluğu', 'Kent'], 'uk': ['графство Кент', 'церемоніальне графство Кент', 'Кент'], 'en-gb': ['Kent'], 'en-ca': ['Kent'], 'cy': ['Caint'], 'es': ['Kent'], 'fr': ['Kent'], 'ga': ['Kent'], 'it': ['Kent'], 'kw': ['Kint'], 'nl': ['Kent'], 'pl': ['Kent'], 'pt': ['Kent'], 'ro': ['Kent'], 'sco': ['Kent']}",,['Q67479626'],"{'Q145': ('1927', ''), 'Q179876': ('', '1707')}",51.19,0.73 +Q1431914,Croydon Airport,['Q644371'],"{'de': ['Croydon Airport', 'RAF Croydon', 'Flughafen London-Croydon'], 'fr': ['base aérienne militaire du Royaume-Uni', 'aéroport de Croydon'], 'en': ['Croydon Airport'], 'it': ['Aeroporto di Croydon'], 'nl': ['Croydon Airport'], 'pt': ['Aeroporto de Croydon'], 'ro': ['Aeroportul Croydon'], 'es': ['Aeropuerto de Croydon']}",,['Q67443130'],"{'Q145': ('', '')}",51.356361,-0.116822 +Q835031,"Embassy of Germany, London",['Q3917681'],"{'pl': ['Niemieccy ambasadorzy w Wielkiej Brytanii', 'Niemmieccy ambasadorzy w Wielkiej Brytanii', 'Ambasadorowie Niemiec w Wielkiej Brytanii'], 'de': ['Deutsche Botschaft London'], 'en': ['Embassy of Germany, London'], 'fr': [""ambassade d'Allemagne au Royaume-Uni""], 'uk': ['Посольство Німеччини у Великій Британії'], 'es': ['embajada de Alemania en el Reino Unido']}",,['Q19186'],"{'Q145': ('', '')}",51.49825,-0.15425 +Q1323689,BFI London Film Festival,['Q220505'],"{'fr': ['Festival de Londres', 'British Film Institute Awards', 'London Film Festival', 'Festival du film de Londres'], 'es': ['London Film Festival', 'Festival de Cine de Londres'], 'tr': ['Londra Uluslararası Film Festivali', 'Londra Film Festivali'], 'de': ['The Times bfi London Film Festival', 'BFI London Film Festival', 'BFI Festival', 'London Film Festival'], 'nl': ['London Film 
Festival', 'The Times BFI London Film Festival', 'Filmfestival van Londen'], 'en': ['London Film Festival', 'BFI Film Festival', 'BFI London Festival', 'BFI Festival', 'BFI London Film Festival'], 'pl': ['BFI London Film Festival', 'London Film Festival', 'BFI Film Festival', 'BFI London Festival', 'BFI Festival', 'Festiwal Filmowy w Londynie'], 'it': ['BFI London Film Festival'], 'en-ca': ['BFI London Film Festival'], 'en-gb': ['BFI London Film Festival'], 'pt': ['Festival de Cinema de Londres'], 'uk': ['Лондонський кінофестиваль']}",['BFI London Film Festival'],[],"{'Q145': ('', '')}",51.506389,-0.115278 +Q7594521,St Mary's Roman Catholic Church,['Q1088552'],"{'en': ['Roman Catholic Church of St Mary (Church of the Redemptionist Fathers)', ""St Mary's Roman Catholic Church, Clapham"", ""St Mary's Roman Catholic Church""]}",,['Q67443130'],"{'Q145': ('', '')}",51.4616,-0.13743 +Q26888,London Borough of Croydon,"['Q211690', 'Q7897276']","{'fr': ['London Borough of Croydon', 'Croydon', 'London Borough de Croydon', 'Borough londonien de Croydon'], 'it': ['borgo londinese di Croydon', 'Croydon'], 'nl': ['London Borough of Croydon', 'Croydon'], 'pt': ['London Borough of Croydon', 'Borough de Croydon', 'London Borough de Croydon', 'Croydon'], 'ro': ['Burgul londonez Croydon', 'Croydon'], 'en': ['Croydon', 'Croydon (unparished area)', 'London Borough of Croydon'], 'pl': ['London Borough of Croydon'], 'es': ['Croydon'], 'de': ['London Borough of Croydon'], 'ga': ['Buirg Londan Croydon'], 'uk': ['Кройдон'], 'cy': ['Bwrdeistref Llundain Croydon'], 'tr': ['Croydon']}",,['Q67443130'],"{'Q145': ('', '')}",51.371111,-0.098889 +Q8709,London Stansted Airport,"['Q644371', 'Q94993988']","{'en': ['Stansted Airport', 'Stansted', 'London Stansted', 'London Stansted Airport'], 'it': ['London Stansted Airport', 'Londra Stansted', 'Aeroporto di Londra Stansted'], 'fr': ['Stansted', 'Londres-Stansted', 'aéroport de Londres Stansted'], 'nl': ['London Stansted Airport', 'Luchthaven Londen 
Stansted'], 'ro': ['London Stansted Airport', 'Aeroportul Londra Stansted'], 'es': ['Londres-Stansted', 'Aeropuerto de Londres-Stansted'], 'cy': ['Maes Awyr Stansted'], 'de': ['Flughafen London-Stansted'], 'pl': ['Port lotniczy Londyn-Stansted'], 'pt': ['Aeroporto de Londres Stansted'], 'ga': ['Aerfort Londain Stansted'], 'tr': ['Londra Stansted Havalimanı'], 'uk': ['Лондон-Станстед']}",['London Stansted Airport'],['Q67442940'],"{'Q145': ('', '')}",51.885,0.235 +Q1402606,BAPS Shri Swaminarayan Mandir London,"['Q106807864', 'Q842402']","{'en': ['Neasden Temple', 'BAPS Shri Swaminarayan Mandir London'], 'de': ['Neasden-Tempel'], 'nl': ['Neasdentempel'], 'pl': ['BAPS Shri Swaminarayan Mandir w Londynie'], 'it': ['Shri Swaminarayan Mandir'], 'fr': ['Neasden Temple'], 'uk': ['Шрі Свамінараян Мандір']}",,['Q19186'],"{'Q145': ('', '')}",51.5475,-0.261667 +Q278054,"Roman Catholic Diocese of London, Ontario",['Q3146899'],"{'en': ['Diocese of London', 'Roman Catholic Diocese of London, Ontario'], 'es': ['Diócesis de Londres', 'Diócesis de London'], 'de': ['Bistum London (Ontario)'], 'it': ['diocesi di London'], 'pl': ['Diecezja London'], 'fr': ['diocèse de London'], 'nl': ['Bisdom London'], 'pt': ['Diocese de London'], 'ga': ['Deoise Chaitliceach London, Ontario']}",,[],"{'Q16': ('', '')}",42.9876,-81.25 +Q801125,London Bridge station,"['Q55488', 'Q55490', 'Q55485']","{'pt': ['Estação da London Bridge (Metro de Londres)', 'Estação de London Bridge', 'Estação da London Bridge', 'London Bridge (Metropolitano de Londres)', 'London Bridge (Metrô de Londres)', 'London Bridge (Metro de Londres)', 'Estação da London Bridge (Metrô de Londres)', 'Estação London Bridge'], 'de': ['London Bridge Station', 'Bahnhof London Bridge'], 'en': ['London Bridge railway station', 'London Bridge station'], 'cy': ['Gorsaf reilffordd London Bridge', 'Gorsaf London Bridge'], 'es': ['Estacion de London Bridge', 'Estación de London Bridge'], 'fr': ['gare de London Bridge', 'London Bridge'], 'nl': 
['Station London Bridge'], 'it': ['stazione di London Bridge'], 'pl': ['London Bridge Station'], 'en-ca': ['London Bridge station'], 'en-gb': ['London Bridge station'], 'tr': ['London Bridge İstasyonu'], 'uk': ['Лондон-брідж (станція)']}",,['Q67443130'],"{'Q145': ('', '')}",51.505,-0.086111 +Q205679,London Borough of Hackney,"['Q211690', 'Q7897276']","{'it': ['London Borough of Hackney', 'Borgo londinese di Hackney', 'Hackney'], 'de': ['Stoke Newington', 'De Beauvoir Town', 'Hackney Wick', 'Upper Clapton', 'Haggerston', 'Hoxton', 'London Borough of Hackney'], 'nl': ['Londen Borough of Hackney', 'London Borough of Hackney', 'Hackney'], 'pt': ['Borough de Hackney', 'London Borough de Hackney', 'Hackney'], 'ro': ['Burgul londonez Hackney', 'Hackney'], 'en': ['Hackney', 'Hackney (unparished area)', 'London Borough of Hackney'], 'fr': ['London Borough of Hackney', 'borough londonien de Hackney'], 'cy': ['Hackney (bwrdeistref)', 'Bwrdeistref Llundain Hackney'], 'pl': ['London Borough of Hackney'], 'es': ['Hackney'], 'ga': ['London Borough of Hackney'], 'uk': ['Гекні'], 'sco': ['London Borough of Hackney'], 'en-ca': ['London Borough of Hackney'], 'en-gb': ['London Borough of Hackney'], 'tr': ['Hackney']}",,['Q19186'],"{'Q145': ('', '')}",51.544722,-0.0575 +Q6669870,London Book Fair,"['Q11483816', 'Q998672', 'Q57305']","{'en': ['London Book Fair'], 'de': ['London Book Fair'], 'es': ['London Book Fair'], 'fr': ['London Book Fair'], 'it': ['London Book Fair'], 'nl': ['London Book Fair'], 'pt': ['London Book Fair'], 'en-gb': ['London Book Fair']}",,['Q19186'],"{'Q145': ('', '')}",51.496,-0.211 +Q1666958,London International Surrealist Exhibition,['Q59861107'],"{'es': ['Exposición Surrealista', 'Exposición Internacional Surrealista', 'Exposición Internacional Surrealista de Londres', 'Exposición Surrealista de Londres', 'Exposición Surrealista Internacional de Londres'], 'de': ['International Surrealist Exhibition'], 'en': ['London International Surrealist Exhibition'], 'fr': 
['International Surrealist Exhibition']}",,[],"{'Q145': ('', '')}",51.509722,-0.141111 +Q5011830,CIQM-FM,['Q14350'],{},,[],"{'Q16': ('', '')}",42.9556,-81.3553 +Q1394500,South London,['Q7631958'],"{'pt': ['Sul de Londres', 'South London'], 'en': ['London/South', 'South London'], 'fr': ['Londres-Sud', 'South London'], 'de': ['South London'], 'nl': ['Zuid-Londen'], 'it': ['Sud di Londra'], 'cy': ['South London'], 'es': ['Londres del Sur'], 'sco': ['Sooth Lunnon'], 'uk': ['Південний Лондон'], 'en-gb': ['South London']}",['South London'],['Q67443130'],"{'Q145': ('', '')}",51.45,-0.1 +Q772421,"St George's, University of London","['Q494230', 'Q2467461', 'Q5341295']","{'en': [""St George's Hospital Medical School"", ""St George's University of London"", ""University of London Saint George's"", ""Saint George's Hospital Medical School"", ""St George's, University of London""], 'en-gb': [""St George's Hospital Medical School"", ""University of London Saint George's"", ""Saint George's Hospital Medical School"", ""St George's, University of London""], 'de': ['St George’s, University of London'], 'cy': [""St George's, Prifysgol Llundain""], 'it': [""St. 
George's Hospital Medical School""], 'uk': ['Коледж Святого Джорджа'], 'fr': [""St George's, University of London""]}",,['Q67443130'],"{'Q145': ('', '')}",51.426944,-0.174722 +Q1749569,Ny-London,['Q2940297'],"{'en': ['London', 'Camp Mansfield', 'Ny-London'], 'de': ['Ny-London'], 'it': ['Ny-London']}",,[],"{'Q20': ('', '')}",78.963333,12.047778 +Q60578265,London,['Q27990982'],"{'en': ['City of London', 'London'], 'fr': ['cité de Londres'], 'ga': ['Londain']}",,[],"{'Q179876': ('', '')}",51.515556,-0.093056 +Q39121,Leeds,"['Q515', 'Q7897276', 'Q1549591', 'Q1187811']","{'es': ['Ciudad de Leeds', 'Cross Gates', 'Leeds'], 'pl': ['Leeds'], 'ga': ['Leeds'], 'nl': ['Leeds'], 'pt': ['Leeds'], 'tr': ['Leeds'], 'uk': ['Лідс'], 'sco': ['Leeds'], 'fr': ['Leeds'], 'it': ['Leeds'], 'de': ['Leeds'], 'en': ['Leeds'], 'ro': ['Leeds'], 'cy': ['Leeds'], 'en-ca': ['Leeds'], 'en-gb': ['Leeds'], 'gd': ['Leeds']}",['Leeds'],['Q163'],"{'Q145': ('', '')}",53.7975,-1.543611 +Q1466941,Leeds railway station,"['Q55488', 'Q7886778']","{'en': ['Leeds City station', 'Leeds station', 'Leeds City railway station', 'Leeds railway station'], 'it': ['stazione di Leeds City', 'stazione di Leeds'], 'de': ['Leeds City Station'], 'cy': ['Gorsaf reilffordd Leeds'], 'fr': ['gare de Leeds'], 'nl': ['station Leeds'], 'pl': ['Leeds City'], 'ro': ['gara Leeds']}",,['Q163'],"{'Q145': ('', '')}",53.794,-1.547 +Q1128631,Leeds United F.C.,['Q476028'],"{'it': ['Leeds United', 'Leeds United AFC', 'Leeds United FC', 'Leeds United Football Club', 'Leeds United A.F.C.', 'Leeds United Association Football Club', 'Leeds United F.C.'], 'ga': ['Leeds United A.F.C', 'Leeds United', 'Leeds United F.C.', 'Leeds United A.F.C.', 'Leeds United Association Football Club'], 'es': ['Leeds United F.C.', 'Leeds United FC', 'Leeds United', 'Leeds United A.F.C.', 'Leeds United Football Club', 'Leeds United A F C', 'Leeds United F C', 'Leeds United A F.C.', 'Leeds United A F C.', 'Leeds United F C.', 'Leeds United Association Football 
Club'], 'pt': ['Leeds United FC', 'Leeds FC', 'Leeds United', 'Leeds fc', 'Leeds united', 'Leeds united afc', 'Leeds united fc', 'Leeds United AFC', 'Leeds united association football club', 'Leeds United A.F.C.', 'Leeds United F.C.', 'Leeds United Association Football Club'], 'tr': ['Leeds United F.C.', 'Leeds United A.F.C.', 'Leeds United', 'Leeds United FC', 'Leeds United Association Football Club', 'Leeds United Football Club', 'The Whites', 'Leeds United AFC'], 'ro': ['Leeds United', 'Leeds United A.F.C.', 'Leeds United F.C.', 'Leeds united afc', 'Leeds United AFC'], 'pl': ['Leeds United', 'Leeds United F.C.', 'Leeds United Association Football Club', 'The Peacocks', 'The Whites', 'Leeds United A.F.C.'], 'fr': ['Leeds United FC', 'Leeds United A.Football Club', 'Leeds United Football Club', 'Leeds United Association Football Club', 'Leeds United AFC', 'Leeds United'], 'de': ['Leeds United F.C.', 'Leeds United A.F.C.', 'Leeds United LFC', 'Leeds United'], 'nl': ['Leeds United', 'Leeds United FC', 'Leeds United A.F.C.', 'Leeds United Football Club', 'Leeds United AFC'], 'uk': ['Лідс (футбольний клуб)', 'Лідс (ФК)', 'Лідс Юнайтед'], 'en': ['Leeds United Football Club', 'Leeds United FC', 'Leeds United', 'Leeds United Association Football Club', 'Leeds United A.F.C.', 'Leeds United AFC', 'Leeds', 'United', 'The Whites', 'The Peacocks', 'Leeds United F.C.'], 'sco': ['Leeds Unitit A.F.C.'], 'cy': ['Leeds United A.F.C.']}",['Leeds United F.C.'],[],"{'Q145': ('', '')}",53.777778,-1.572222 +Q774015,Leeds,"['Q21503295', 'Q1002812']","{'en': ['City of Leeds', 'City and Borough of Leeds', 'Leeds'], 'en-gb': ['City of Leeds', 'Leeds'], 'nl': ['City of Leeds'], 'de': ['City of Leeds'], 'fr': ['cité de Leeds'], 'it': ['City of Leeds'], 'pl': ['City of Leeds'], 'tr': ['Leeds Şehri'], 'en-ca': ['City of Leeds'], 'uk': ['Сіті-оф-Лідс'], 'cy': ['Dinas Leeds'], 'ga': ['Leeds']}",,[],"{'Q145': ('', '')}",53.799167,-1.549167 +Q503424,University of Leeds,"['Q62078547', 
'Q5341295']","{'pl': ['Uniwersytet Leeds', 'Leeds University', 'University of Leeds'], 'fr': ['Leeds University', 'University of Leeds', 'université de Leeds'], 'es': ['University of Leeds', 'Universidad de Leeds'], 'de': ['Universität Leeds', 'University of Leeds'], 'nl': ['University of Leeds', 'Leeds University', 'Universiteit van Leeds'], 'pt': ['University of leeds', 'Universidade de Leeds'], 'ro': ['Universitatea Leeds', 'Universitatea din Leeds', 'University of Leeds'], 'uk': ['Лідський університет', 'Університет Лідса'], 'en': ['Leeds University', 'University of Leeds'], 'en-gb': ['Leeds University', 'University of Leeds'], 'en-ca': ['Leeds University', 'University of Leeds'], 'tr': ['Leeds Üniversitesi'], 'sco': ['Varsity o Leeds'], 'it': ['Università di Leeds'], 'cy': ['Prifysgol Leeds'], 'gd': ['Oilthigh Leeds'], 'ga': ['Ollscoil Leeds']}",,['Q163'],"{'Q145': ('', '')}",53.807222,-1.551667 +Q1137962,Reading and Leeds Festivals,['Q868557'],"{'pt': ['Reading Festival', 'Reading and leeds festival', 'Reading and leeds festivals', 'Leeds Festival', 'Festivais de Reading e Leeds'], 'fr': ['Reading Festival', 'Leeds Festival', 'Festival de Reading', 'Reading and Leeds Festivals'], 'es': ['Reading Festival', 'Festival de Reading y Leeds', 'Reading and Leeds Festivals', 'Leeds Festival', 'Festival de Leeds', 'Carling Weekend', 'Festival de Reading', 'Festivales de Reading y Leeds'], 'it': ['Reading Festival', 'Reading e Leeds festivals', 'Festival di Leeds', 'Festival di Reading', 'Reading and Leeds Festival', 'Leeds Festival', 'Carling Weekend', 'Reading/Leeds Festival', 'Festival di Reading e Leeds'], 'de': ['Reading Festival', 'Reading and Leeds Festivals'], 'en': ['Reading Festival', 'Leeds Festival', 'Reading Music Festival', 'Reading Music Fest', 'Reading Fest', 'Reading and Leeds Fest', 'Reading and Leeds Festivals'], 'nl': ['Reading en Leeds Festivals'], 'uk': ['Фестивалі Редінг і Лідс']}",,[],"{'Q145': ('', '')}",51.467222,-1.011944 +Q6515934,Leeds City 
bus station,['Q494829'],{'en': ['Leeds City bus station']},,['Q163'],"{'Q145': ('', '')}",53.7969,-1.53528 +Q4834918,BBC Radio Leeds,['Q14350'],"{'en': ['BBC Radio Leeds'], 'pl': ['BBC Radio Leeds'], 'en-ca': ['BBC Radio Leeds'], 'en-gb': ['BBC Radio Leeds']}",,[],"{'Q145': ('', '')}",53.797389,-1.533833 +Q7721041,The Calls,"['Q79007', 'Q123705']","{'en': ['The Calls'], 'ga': ['The Calls'], 'nl': ['The Calls'], 'fr': ['The Calls']}",,['Q163'],"{'Q145': ('', '')}",53.794,-1.538 +Q482468,Leeds,['Q15127012'],"{'en': ['Leeds, Utah', 'Leeds, UT', 'Leeds'], 'pt': ['Leeds'], 'es': ['Leeds'], 'nl': ['Leeds'], 'de': ['Leeds'], 'it': ['Leeds'], 'pl': ['Leeds'], 'fr': ['Leeds'], 'uk': ['Лідс'], 'cy': ['Leeds, Utah']}",,[],"{'Q30': ('', '')}",37.239444,-113.360833 +Q2460124,Leeds,"['Q532', 'Q1115575']","{'en': ['Leeds, Kent', 'Leeds'], 'cy': ['Leeds, Caint', 'Leeds'], 'nl': ['Leeds'], 'pl': ['Leeds'], 'de': ['Leeds'], 'fr': ['Leeds'], 'it': ['Leeds (Kent)'], 'tr': ['Leeds, Kent'], 'ga': ['Leeds']}",,['Q67479626'],"{'Q145': ('', '')}",51.246311,0.606631 +Q79869,Leeds,['Q1093829'],"{'en': ['Leeds, Alabama', 'Leeds, AL', 'Leeds'], 'es': ['Leeds (Alabama)', 'Leeds'], 'pt': ['Leeds'], 'uk': ['Лідс'], 'it': ['Leeds'], 'de': ['Leeds'], 'nl': ['Leeds'], 'fr': ['Leeds'], 'pl': ['Leeds (Alabama)'], 'tr': ['Leeds'], 'cy': ['Leeds, Alabama'], 'ga': ['Leeds']}",,[],"{'Q30': ('', '')}",33.545592,-86.557388 +Q746876,Leeds Castle,"['Q2087181', 'Q1343246', 'Q23413']","{'de': ['Leeds Castle'], 'en': ['Leeds Castle'], 'es': ['Castillo de Leeds'], 'fr': ['château de Leeds'], 'it': ['castello di Leeds'], 'nl': ['Leeds Castle'], 'pl': ['Zamek Leeds'], 'pt': ['Castelo de Leeds'], 'uk': ['Лідс'], 'ga': ['Caisleán Leeds']}",,['Q67479626'],"{'Q145': ('', '')}",51.2491,0.630411 +Q6515805,Leeds,"['Q751708', 'Q17343829']","{'en': ['Leeds, Massachusetts', 'Leeds, MA', 'Leeds'], 'fr': ['Leeds'], 'tr': ['Leeds, Massachusetts']}",,[],"{'Q30': ('', '')}",42.3514,-72.6994 +Q3461415,Leeds,"['Q498162', 
'Q17343829']","{'en': ['Leeds, NY', 'Leeds'], 'es': ['Leeds'], 'it': ['Leeds'], 'de': ['Leeds'], 'fr': ['Leeds'], 'pl': ['Leeds'], 'uk': ['Лідс'], 'ga': ['Leeds']}",,[],"{'Q30': ('', '')}",42.2533,-73.8967 +Q2365261,Leeds,['Q1093829'],"{'en': ['Leeds, North Dakota', 'Leeds, ND', 'Leeds'], 'nl': ['Leeds'], 'es': ['Leeds'], 'it': ['Leeds'], 'pt': ['Leeds'], 'fr': ['Leeds'], 'pl': ['Leeds'], 'tr': ['Leeds'], 'uk': ['Лідс'], 'cy': ['Leeds, Gogledd Dakota'], 'ga': ['Leeds']}",,[],"{'Q30': ('', '')}",48.289444,-99.438889 +Q21061609,Headingley Cricket Ground,"['Q682943', 'Q483110']","{'en': ['Headingley Carnegie Cricket Stadium', 'Emerald Headingley Cricket Ground', 'Headingley Carnegie Cricket Ground', 'Headingley Cricket Ground'], 'fr': ['Headingley Cricket Ground'], 'en-gb': ['Headingley Cricket Ground'], 'it': ['Headingley Cricket Ground']}",,['Q163'],"{'Q145': ('', '')}",53.8177,-1.58198 +Q7746609,The Leeds Studios,['Q811979'],{'en': ['The Leeds Studios']},,['Q163'],"{'Q145': ('', '')}",53.803366,-1.570106 +Q14875251,Leeds County,['Q4204495'],{'en': ['Leeds County']},,[],"{'Q16': ('', '')}",44.5833,-76.0 +Q6515927,Leeds City Region,['Q618123'],{'en': ['Leeds City Region']},,[],"{'Q145': ('', '')}",53.8,-1.549 +Q5177618,County Borough of Leeds,['Q1137272'],{'en': ['County Borough of Leeds']},,[],"{'Q145': ('', '')}",53.799722,-1.549167 +Q8699,Leeds Bradford Airport,"['Q644371', 'Q94993988']","{'en': ['Leeds Bradford International Airport', 'Leeds Bradford Airport'], 'fr': ['aérodrome de Royaume Uni', 'aéroport international de Leeds-Bradford'], 'ro': ['Aeroportul Leeds Bradford', 'Aeroportul Internațional Leeds Bradford'], 'cy': ['Maes Awyr Rhyngwladol Leeds Bradford'], 'de': ['Leeds Bradford International Airport'], 'es': ['Aeropuerto Internacional de Leeds Bradford'], 'it': ['Aeroporto di Leeds-Bradford'], 'nl': ['Leeds Bradford International Airport'], 'pl': ['Port lotniczy Leeds/Bradford'], 'pt': ['Aeroporto Internacional de Leeds Bradford'], 'uk': 
['Лідс-Бредфорд']}",,['Q163'],"{'Q145': ('', '')}",53.865833,-1.660556 +Q27985411,"Leeds, Kansas City",['Q123705'],"{'en': ['Leeds, Kansas City'], 'nl': ['Leeds, Kansas City']}",,[],"{'Q30': ('', '')}",39.055838,-94.508565 +Q6515866,Leeds Central railway station,['Q55488'],"{'en': ['Leeds Central railway station'], 'nl': ['station Leeds Central']}",,['Q163'],"{'Q145': ('', '')}",53.7958,-1.5547 +Q871138,Roman Catholic Diocese of Leeds,['Q3146899'],"{'en': ['Diocese of Leeds', 'Roman Catholic Diocese of Leeds'], 'fr': ['diocèse de Leeds'], 'de': ['Bistum Leeds'], 'it': ['diocesi di Leeds'], 'pl': ['Diecezja Leeds'], 'nl': ['Bisdom Leeds'], 'es': ['diócesis de Leeds'], 'ga': ['Deoise Chaitliceach Leeds']}",,[],"{'Q145': ('', '')}",53.7833,-1.53333 +Q4763489,Anglican Diocese of Leeds,['Q18917976'],"{'en': ['Diocese of Leeds', 'Diocese of West Yorkshire and the Dales', 'Anglican Diocese of Leeds'], 'en-gb': ['Diocese of West Yorkshire and the Dales', 'Anglican Diocese of Leeds'], 'it': ['diocesi anglicana di Leeds', 'diocesi di Leeds'], 'pl': ['Diecezja Leeds'], 'es': ['Diócesis anglicana de Leeds'], 'de': ['Anglikanische Diözese Leeds'], 'nl': ['Bisdom Leeds']}",,[],"{'Q145': ('', '')}",53.7998,-1.5305 +Q4871546,Battle of Leeds,['Q178561'],"{'en': ['Battle of Leeds'], 'es': ['Batalla de Leeds'], 'it': ['Battaglia di Leeds']}",,[],"{'Q145': ('', '')}",53.7969,-1.5424 +Q24896243,Elland Road Greyhound Stadium,"['Q483110', 'Q45290083']",{'en': ['Elland Road Greyhound Stadium']},,['Q163'],"{'Q145': ('', '')}",53.775,-1.575 +Q1187032,Headingley Stadium,['Q1076486'],"{'de': ['Headingley-Carnegie-Stadion', 'Headingley Carnegie-Stadion', 'Headingley Carnegie Stadium', 'Headingley-Stadion', 'Headingley Stadium'], 'fr': ['Headingley Stadium', 'Headingley Carnegie Stadium'], 'en': ['Headingley Stadium'], 'nl': ['Headingley Stadium']}",,['Q163'],"{'Q145': ('', '')}",53.817661,-1.581978 +Q489255,Sioux City,"['Q1093829', 'Q62049']","{'en': ['Sioux City, Iowa', 'Sioux City, IA', 
'Sioux City'], 'es': ['Sioux City (Iowa)', 'Sioux City'], 'pl': ['Sioux City'], 'gd': ['Sioux City'], 'fr': ['Sioux City'], 'it': ['Sioux City'], 'de': ['Sioux City'], 'kw': ['Sioux City'], 'nl': ['Sioux City'], 'pt': ['Sioux City'], 'cy': ['Sioux City'], 'uk': ['Су-Сіті'], 'sco': ['Sioux City'], 'tr': ['Sioux City'], 'ga': ['Sioux City']}",['Sioux City'],[],"{'Q30': ('', '')}",42.498056,-96.395556 +Q3228965,Leeds Arts University,"['Q383092', 'Q5341295']","{'fr': [""Ecole d'Art de Leeds"", 'Leeds School of Art', 'Leeds college of art and design', 'Leeds College Of Art And Design', ""École d'Art de Leeds"", 'Leeds College of Arts', 'Leeds College of Art and Design'], 'en': ['Jacob Kramer College', 'Leeds College Of Art And Design', 'Leeds College of Art', 'Leeds Arts University'], 'en-gb': ['Jacob Kramer College', 'Leeds College Of Art And Design', 'Leeds College of Art', 'Leeds Arts University'], 'es': ['Leeds College of Art'], 'de': ['Leeds College of Art'], 'nl': ['Leeds Arts University'], 'it': ['Leeds Arts University']}",,['Q163'],"{'Q145': ('', '')}",53.8084,-1.5517 +Q209266,Leeds,['Q15127012'],"{'en': ['Leeds, ME', 'Leeds'], 'es': ['Leeds (Maine)', 'Leeds'], 'fr': ['Leeds'], 'nl': ['Leeds'], 'de': ['Leeds'], 'it': ['Leeds'], 'pl': ['Leeds'], 'uk': ['Лідс'], 'cy': ['Leeds, Maine'], 'tr': ['Leeds, Maine']}",,[],"{'Q30': ('', '')}",44.303333,-70.119167 +Q42448,Sheffield,"['Q515', 'Q1549591', 'Q7897276']","{'es': ['Steel City', 'Sheffield'], 'uk': ['Шефілд', 'Шеффілд'], 'it': ['City of Sheffield', 'Sheffield'], 'en': ['Sheffield, South Yorkshire', 'Sheffield, England', 'Sheffield'], 'pl': ['Sheffield'], 'ga': ['Sheffield'], 'nl': ['Sheffield'], 'pt': ['Sheffield'], 'tr': ['Sheffield'], 'sco': ['Sheffield'], 'fr': ['Sheffield'], 'de': ['Sheffield'], 'ro': ['Sheffield'], 'cy': ['Sheffield'], 'en-ca': ['Sheffield'], 'en-gb': ['Sheffield'], 'gd': ['Sheffield'], 'kw': ['Sheffield']}",['Sheffield'],['Q163'],"{'Q145': ('', '')}",53.380833,-1.470278 +Q7492778,Sheffield 
Victoria railway station,['Q55488'],"{'en': ['Sheffield Victoria railway station'], 'nl': ['station Sheffield Victoria']}",,['Q163'],"{'Q145': ('', '')}",53.3875,-1.45876 +Q7492565,"Sheffield, Cornwall",['Q532'],"{'cy': ['Sheffield, Cernyw', 'Sheffield'], 'en': ['Sheffield, Cornwall'], 'pl': ['Sheffield'], 'fr': ['Sheffield'], 'nl': ['Sheffield, Cornwall'], 'ga': ['Sheffield, Corn na Breataine']}",,['Q23148'],"{'Q145': ('', '')}",50.0868,-5.555 +Q1862179,Sheffield station,['Q55488'],"{'en': ['Pond Street', 'Sheffield Midland', 'Sheffield', 'Sheffield Station And Attached Bridges And Platform Bridges', 'Sheffield railway station', 'Sheffield station'], 'nl': ['Station Sheffield', 'station Sheffield'], 'de': ['Sheffield station', 'Bahnhof Sheffield'], 'cy': ['Gorsaf reilffordd Sheffield'], 'fr': ['gare de Sheffield'], 'pl': ['Sheffield'], 'it': ['stazione di Sheffield'], 'en-ca': ['Sheffield station'], 'en-gb': ['Sheffield station'], 'ro': ['gara Sheffield']}",,['Q163'],"{'Q145': ('', '')}",53.377778,-1.462222 +Q823917,University of Sheffield,"['Q62078547', 'Q5341295']","{'fr': ['Universite de Sheffield', 'University of Sheffield', 'université de Sheffield'], 'tr': ['University of Sheffield', 'Sheffield Üniversitesi'], 'ro': ['University of Sheffield', 'Universitatea Sheffield'], 'it': ['University of Sheffield', 'Università di Sheffield'], 'de': ['University of Sheffield', 'Universität Sheffield'], 'nl': ['University of Sheffield', 'Universiteit van Sheffield'], 'en': ['Sheffield University', 'The University of Sheffield', 'University of Sheffield'], 'en-gb': ['The University of Sheffield', 'University of Sheffield'], 'es': ['University of Sheffield', 'Universidad de Sheffield'], 'pl': ['University of Sheffield'], 'cy': ['Prifysgol Sheffield'], 'pt': ['Universidade de Sheffield'], 'uk': ['Університет Шеффілда'], 'gd': ['Oilthigh Sheffield'], 'ga': ['Ollscoil Sheffield']}",['University of Sheffield'],['Q163'],"{'Q145': ('', '')}",53.380722,-1.488806 +Q4834926,BBC 
Radio Sheffield,['Q14350'],"{'en': ['BBC Radio Sheffield'], 'pl': ['BBC Radio Sheffield'], 'en-ca': ['BBC Radio Sheffield'], 'en-gb': ['BBC Radio Sheffield'], 'it': ['BBC Radio Sheffield'], 'fr': ['BBC Radio Sheffield']}",,[],"{'Q145': ('', '')}",53.3759,-1.4668 +Q17643392,Manor Lodge,['Q811979'],"{'en': ['Manor House', 'Manor Lodge'], 'de': ['Sheffield Manor'], 'nl': ['Sheffield Manor'], 'ga': ['Sheffield Manor'], 'es': ['Sheffield Manor Lodge']}",,['Q23436'],"{'Q145': ('', '')}",53.3739,-1.43717 +Q2306176,Sheffield,['Q15127012'],"{'en': ['Sheffield, Massachusetts', 'Sheffield, MA', 'Sheffield'], 'es': ['Sheffield'], 'it': ['Sheffield'], 'nl': ['Sheffield'], 'fr': ['Sheffield'], 'de': ['Sheffield, MA'], 'pl': ['Sheffield'], 'cy': ['Sheffield, Massachusetts'], 'uk': ['Шеффілд'], 'pt': ['Sheffield'], 'tr': ['Sheffield, Massachusetts']}",,[],"{'Q30': ('', '')}",42.110278,-73.355556 +Q897533,Bramall Lane,"['Q1154710', 'Q45290083', 'Q45290083']","{'fr': ['Bramall Lane'], 'en': ['Bramall Lane'], 'pt': ['Bramall Lane'], 'de': ['Bramall Lane'], 'it': ['Bramall Lane'], 'pl': ['Bramall Lane'], 'nl': ['Bramall Lane'], 'es': ['Bramall Lane'], 'uk': ['Бремолл Лейн'], 'tr': ['Bramall Lane'], 'ro': ['Bramall Lane']}",,['Q163'],"{'Q145': ('', '')}",53.370278,-1.470833 +Q7492570,Sheffield,['Q17343829'],"{'en': ['Sheffield, Texas', 'Sheffield, TX', 'Sheffield'], 'de': ['Sheffield'], 'nl': ['Sheffield, Texas']}",,[],"{'Q30': ('', '')}",30.6906,-101.823 +Q1950928,Sheffield,['Q2154459'],"{'en': ['Sheffield, VT', 'Sheffield'], 'de': ['Sheffield'], 'en-ca': ['Sheffield, Vermont'], 'en-gb': ['Sheffield, Vermont'], 'es': ['Sheffield'], 'fr': ['Sheffield'], 'pl': ['Sheffield'], 'it': ['Sheffield'], 'uk': ['Шеффілд'], 'cy': ['Sheffield, Vermont'], 'tr': ['Sheffield, Vermont']}",,[],"{'Q30': ('', '')}",44.642322,-72.127616 +Q2277715,Sheffield,"['Q3957', 'Q98433835']","{'nl': ['Sheffield, Tasmania', 'Sheffield'], 'en': ['Sheffield, Tasmania', 'Sheffield, Tasmania, Australia', 'Sheffield'], 
'de': ['Sheffield (Tasmanien)', 'Sheffield'], 'it': ['Sheffield'], 'fr': ['Sheffield'], 'pl': ['Sheffield (Tasmania)']}",,[],"{'Q408': ('', '')}",-41.382222,146.324722 +Q79568,Sheffield,['Q1093829'],"{'en': ['Sheffield, Alabama', 'Sheffield, AL', 'Sheffield'], 'es': ['Sheffield (Alabama)', 'Sheffield'], 'pt': ['Sheffield'], 'fr': ['Sheffield'], 'uk': ['Шеффілд'], 'it': ['Sheffield'], 'de': ['Sheffield, AL'], 'nl': ['Sheffield'], 'pl': ['Sheffield (Alabama)'], 'tr': ['Sheffield'], 'cy': ['Sheffield, Alabama'], 'ga': ['Sheffield']}",['Sheffield'],[],"{'Q30': ('', '')}",34.759721,-87.694592 +Q518864,Sheffield,['Q751708'],"{'en': ['Sheffield, Illinois', 'Sheffield, IL', 'Sheffield'], 'nl': ['Sheffield'], 'es': ['Sheffield'], 'pt': ['Sheffield'], 'de': ['Sheffield, IL'], 'fr': ['Sheffield'], 'it': ['Sheffield'], 'pl': ['Sheffield (Illinois)'], 'uk': ['Шеффілд'], 'cy': ['Sheffield, Illinois']}",,[],"{'Q30': ('', '')}",41.3558,-89.7367 +Q7492591,Sheffield Blitz,,{'en': ['Sheffield Blitz']},,[],{},53.383333,-1.466667 +Q7492775,Sheffield Township,['Q9035798'],"{'en': ['Sheffield Township, Pennsylvania', 'Township of Sheffield', 'Sheffield Township'], 'es': ['Municipio de Sheffield (condado de Warren, Pensilvania)'], 'de': ['Sheffield Township'], 'uk': ['Шеффілд Тауншип'], 'cy': ['Sheffield Township, Pennsylvania']}",,[],"{'Q30': ('', '')}",41.624722,-78.983056 +Q741640,Wheel of Sheffield,['Q202570'],{'en': ['Wheel of Sheffield']},,[],"{'Q145': ('', '')}",53.381,-1.4699 +Q7492686,Sheffield Interchange,['Q494829'],{'en': ['Sheffield Interchange']},,['Q163'],"{'Q145': ('', '')}",53.3812,-1.46451 +Q3577611,Sheffield Lock,['Q105731'],"{'en': ['Sheffield (or Shenfield) Lock', 'Sheffield Lock At Su 648706', 'Sheffield Lock'], 'fr': ['écluse de Sheffield']}",,['Q67284726'],"{'Q145': ('', '')}",51.4307,-1.06927 +Q12956644,Sheffield,"['Q1002812', 'Q21503295']","{'en': ['City of Sheffield', 'Sheffield'], 'pl': ['City of Sheffield'], 'en-gb': ['Sheffield'], 'fr': ['Sheffield'], 'cy': 
['Dinas Sheffield'], 'uk': ['Шеффілд'], 'ga': ['Sheffield']}",,[],"{'Q145': ('', '')}",53.41667,-1.5 +Q547824,HMS Sheffield,"['Q2607934', 'Q852190']","{'pl': ['HMS Sheffield'], 'de': ['HMS Sheffield'], 'en': ['HMS Sheffield'], 'es': ['HMS Sheffield'], 'fr': ['HMS Sheffield'], 'it': ['HMS Sheffield'], 'nl': ['HMS Sheffield'], 'pt': ['HMS Sheffield (D80)'], 'ga': ['HMS Sheffield']}",,[],{},-53.066667,-56.933333 +Q7492719,Sheffield Parish,['Q3252927'],"{'en': ['Sheffield Parish, New Brunswick', 'Sheffield Parish'], 'fr': ['paroisse de Sheffield'], 'uk': ['Шеффілд (парафія, Нью-Брансвік)']}",,[],"{'Q16': ('', '')}",45.9955,-66.2224 +Q7492566,Sheffield,['Q3257686'],"{'en': ['Sheffield, New Zealand', 'Sheffield'], 'nl': ['Sheffield, New Zealand'], 'fr': ['Sheffield (Nouvelle-Zélande)']}",,[],"{'Q664': ('', '')}",-43.388889,172.018056 +Q7492567,Sheffield,['Q56885635'],{'en': ['Sheffield']},,[],"{'Q16': ('', '')}",43.324,-80.201 +Q4523493,Sheffield urban area,['Q702492'],"{'en': ['Sheffield urban area'], 'uk': ['Шеффілд (міська агломерація)']}",,['Q163'],"{'Q145': ('', '')}",53.395,-1.455 +Q3028626,Diocese of Sheffield,['Q18917976'],"{'fr': ['diocèse de Sheffield'], 'en': ['Diocese of Sheffield'], 'pl': ['Diecezja Sheffield'], 'de': ['Diözese von Sheffield'], 'nl': ['Bisdom Sheffield']}",,[],"{'Q145': ('', '')}",53.382,-1.47 +Q7492607,Sheffield city centre,['Q738570'],"{'en': ['Sheffield city centre'], 'ga': ['Sheffield Lár na Cathrach']}",,['Q163'],"{'Q145': ('', '')}",53.3814,-1.4746 +Q3365926,Sheffield,['Q486972'],"{'fr': ['Sheffield (nouveau-brunswick)', 'Sheffield (Nouveau-Brunswick)', 'Sheffield'], 'en': ['Sheffield, New Brunswick', 'Sheffield'], 'nl': ['Sheffield, New Brunswick']}",,[],"{'Q16': ('', '')}",45.883,-66.3 +Q7492568,Sheffield,['Q17343829'],"{'en': ['Sheffield, North Carolina', 'Sheffield, NC', 'Sheffield'], 'es': ['Sheffield (Carolina del Norte)'], 'fr': ['Sheffield']}",,[],"{'Q30': ('', '')}",35.966428,-80.680081 
+Q108940076,Sheffield,['Q498162'],"{'en': ['Sheffield (CDP), Vermont', 'Sheffield'], 'ga': ['Sheffield']}",,[],"{'Q30': ('', '')}",44.601944,-72.114167 +Q1184547,Sheffield,['Q498162'],"{'en': ['Sheffield, Pennsylvania', 'Sheffield, PA', 'Sheffield'], 'es': ['Sheffield, Pensilvania', 'Sheffield'], 'pt': ['Sheffield'], 'nl': ['Sheffield'], 'de': ['Sheffield'], 'fr': ['Sheffield'], 'it': ['Sheffield'], 'uk': ['Шеффілд'], 'ga': ['Sheffield']}",,[],"{'Q30': ('', '')}",41.7042,-79.0339 +Q1984238,Sheffield,['Q751708'],"{'en': ['Sheffield, Ohio', 'Sheffield Village', 'Sheffield Village, Ohio', 'Sheffield, OH', 'Sheffield'], 'es': ['Sheffield (Ohio)', 'Sheffield'], 'nl': ['Sheffield'], 'pt': ['Sheffield'], 'de': ['Sheffield'], 'fr': ['Sheffield'], 'it': ['Sheffield'], 'pl': ['Sheffield (Ohio)'], 'uk': ['Шеффілд'], 'cy': ['Sheffield, Ohio']}",,[],"{'Q30': ('', '')}",41.4481,-82.0833 diff --git a/tests/sample_files/resources/wikidata/wikidata_to_mentions_normalized.json b/tests/sample_files/resources/wikidata/wikidata_to_mentions_normalized.json new file mode 100644 index 00000000..29f2ffde --- /dev/null +++ b/tests/sample_files/resources/wikidata/wikidata_to_mentions_normalized.json @@ -0,0 +1,1933 @@ +{ + "Q123885": { + "": 0.0005159958720330237, + "Royal Society": 0.785345717234262, + "London": 0.001547987616099071, + "Royal Society of London": 0.07739938080495355, + "Fellow": 0.0005159958720330237, + "Foreign Member of the Royal Society": 0.0005159958720330237, + "Fellow of the Royal Society": 0.012383900928792569, + "Presidency": 0.0005159958720330237, + "Royal": 0.0010319917440660474, + "FRS": 0.08513931888544891, + "Royal Academy of Sciences": 0.0005159958720330237, + "The Royal Society of London for Improving Natural Knowledge": 0.0020639834881320948, + "FRS(For)": 0.0036119711042311656, + "British Royal Society": 0.001547987616099071, + "Foreign Member of the Royal Society (ForMemRS)": 0.0010319917440660474, + "Royal Society of England": 0.001547987616099071, + 
"F.R.S.": 0.0020639834881320948, + "Royal Society in London": 0.0005159958720330237, + "Fellowship of the Royal Society": 0.0005159958720330237, + "Royal Societies": 0.0010319917440660474, + "FRS (For)": 0.0005159958720330237, + "The Royal Society": 0.007739938080495355, + "ForMemRS": 0.0020639834881320948, + "FFRS": 0.0010319917440660474, + "Royal Society of Great Britain": 0.0005159958720330237, + "British Academy of Science": 0.0005159958720330237, + "Royal Society of London for Improving Natural Knowledge": 0.0010319917440660474, + "FRS (FOR)": 0.0005159958720330237, + "FRSF": 0.0005159958720330237, + "the Royal Society": 0.0025799793601651187, + "UK Royal Society": 0.0005159958720330237, + "its British counterpart": 0.0005159958720330237, + "Royal Academy of Science": 0.0005159958720330237, + "scientific institution": 0.0005159958720330237, + "The Royal Society of London": 0.0005159958720330237, + "Society of London for Improving Natural Knowledge": 0.0005159958720330237, + "FRSFor": 0.0005159958720330237 + }, + "Q734547": { + "North": 0.008426966292134831, + "north": 0.011235955056179775, + "London": 0.0028089887640449437, + "North London": 0.7780898876404494, + "north London": 0.1797752808988764, + "north west London": 0.0028089887640449437, + "North West London": 0.011235955056179775, + "North London's": 0.0028089887640449437, + "Northern London": 0.0028089887640449437 + }, + "Q1394500": { + "South": 0.007978723404255319, + "southeast": 0.007978723404255319, + "south": 0.007978723404255319, + "south-east": 0.005319148936170213, + "London": 0.0026595744680851063, + "South London": 0.7819148936170213, + "south London": 0.14627659574468085, + "South East": 0.007978723404255319, + "South East London": 0.013297872340425532, + "south east": 0.0026595744680851063, + "south west London": 0.0026595744680851063, + "South West London": 0.005319148936170213, + "south-east London": 0.0026595744680851063, + "South London, England": 0.0026595744680851063, + "southeast 
London": 0.0026595744680851063 + }, + "Q338466": { + "medieval": 0.03333333333333333, + "London": 0.13333333333333333, + "Saxon": 0.03333333333333333, + "Anglo-Saxon settlement": 0.03333333333333333, + "Lundenwic": 0.5333333333333333, + "Lundenburh": 0.16666666666666666, + "Lundenburgh": 0.03333333333333333, + "Lundenwick": 0.03333333333333333 + }, + "Q927198": { + "Roman": 0.01948051948051948, + "Augusta": 0.006493506493506494, + "Roman times": 0.006493506493506494, + "London": 0.01948051948051948, + "Londinium": 0.8051948051948052, + "settlement by the Romans": 0.006493506493506494, + "Lundonia": 0.006493506493506494, + "Roman London": 0.05844155844155845, + "Londinium's": 0.006493506493506494, + "roughly 1600 years old": 0.006493506493506494, + "Londonium": 0.006493506493506494, + "London citadel": 0.006493506493506494, + "governor's palace": 0.012987012987012988, + "Londinium (London)": 0.006493506493506494, + "\"Londinium\"": 0.006493506493506494, + "Roman city of London": 0.006493506493506494, + "Londinium Augusta": 0.006493506493506494, + "Caer Lundein": 0.006493506493506494 + }, + "Q8111": { + "Olympic Games": 0.011029411764705881, + "London": 0.007352941176470588, + "1908": 0.11029411764705882, + "1908 Olympics in London": 0.003676470588235294, + "1908 Olympics": 0.04044117647058824, + "1908 Summer Olympics": 0.6617647058823529, + "London Olympic Games": 0.01838235294117647, + "London in 1908": 0.003676470588235294, + "1908 London Games": 0.007352941176470588, + "1908 Games": 0.011029411764705881, + "1908 London Olympics": 0.029411764705882353, + "1908 Olympic Games": 0.044117647058823525, + "London Olympics": 0.007352941176470588, + "1908 Summer Olympic Games": 0.007352941176470588, + "Olympic Marathon": 0.003676470588235294, + "Rome 1908": 0.003676470588235294, + "the 1908 Games": 0.003676470588235294, + "London 1908": 0.003676470588235294, + "London Olympics in 1908": 0.003676470588235294, + "London (1908)": 0.003676470588235294, + "Olympics in 1908": 
0.003676470588235294, + "London games in 1908": 0.003676470588235294, + "1908 games": 0.003676470588235294, + "summer Olympics held in London in July 1908": 0.003676470588235294 + }, + "Q8577": { + "Olympic Games": 0.00495458298926507, + "2008": 0.0008257638315441783, + "2012": 0.03550784475639967, + "Olympic": 0.007431874483897605, + "Summer Olympics": 0.005780346820809248, + "London": 0.00495458298926507, + "Olympics": 0.0016515276630883566, + "games": 0.0008257638315441783, + "London Olympics 2012": 0.005780346820809248, + "2012 Summer Olympics": 0.4921552436003303, + "2012 London Olympics": 0.09578860445912468, + "2012 Summer Olympic Games": 0.00495458298926507, + "2012 Olympics": 0.061932287365813375, + "London 2012 Olympic Games": 0.021469859620148635, + "London in 2012": 0.0008257638315441783, + "London 2012 Olympics": 0.02972749793559042, + "Games": 0.0008257638315441783, + "2012 Olympic Games": 0.06028075970272502, + "London 2012 Summer Olympics": 0.010734929810074317, + "2012 Olympic Park": 0.0008257638315441783, + "Olympics Games": 0.0008257638315441783, + "London 2012 Summer Olympic Committee": 0.0008257638315441783, + "London Summer Olympics": 0.002477291494632535, + "London 2012 Olympic and Paralympic Games": 0.004128819157720892, + "In 2012": 0.0008257638315441783, + "London 2012": 0.03881090008257638, + "London Olympic Games": 0.004128819157720892, + "2012 London Games": 0.0016515276630883566, + "2012 London Summer Olympics": 0.00495458298926507, + "July 2012": 0.0008257638315441783, + "Summer Olympics in London": 0.0008257638315441783, + "2012 Olympic": 0.012386457473162676, + "London Olympics in 2012": 0.0033030553261767133, + "2012 games": 0.0016515276630883566, + "London Olympics": 0.020644095788604457, + "XXX Olympiad in London": 0.0008257638315441783, + "2012 Games": 0.00495458298926507, + "2012 London Olympic Games": 0.00990916597853014, + "2012 Paralympic Games": 0.0008257638315441783, + "Olympics in London": 0.0016515276630883566, + "Summer 
Olympics 2012": 0.0008257638315441783, + "London Olympic Games 2012": 0.0008257638315441783, + "London 2012 Olympic": 0.0008257638315441783, + "2012 Olympic gold medalist": 0.0008257638315441783, + "London, 2012": 0.0008257638315441783, + "2012 London Olympics opening ceremony": 0.0008257638315441783, + "2012 Summer Olympics logo": 0.0008257638315441783, + "XXX Olympic Games": 0.0008257638315441783, + "2012 London Olympics and Paralympics": 0.0008257638315441783, + "London 2012 Games": 0.002477291494632535, + "2012 Summer Olympic": 0.0008257638315441783, + "Olympic Games in London": 0.0008257638315441783, + "2012 Summer Olympics in London": 0.0008257638315441783, + "2012 Olympian": 0.0016515276630883566, + "Olympic Games of 2012": 0.0008257638315441783, + "the warm-up": 0.0008257638315441783, + "2012 in London": 0.0008257638315441783, + "London Games of 2012": 0.0008257638315441783, + "London for 2012": 0.0008257638315441783, + "Summer Olympics of 2012": 0.0008257638315441783, + "2012 London": 0.002477291494632535, + "upcoming Summer Olympics": 0.0008257638315441783, + "2012 Summer Games": 0.002477291494632535, + "the previous Summer Olympic Games": 0.0008257638315441783, + "the 2012 London Olympics": 0.0008257638315441783, + "2012 Olympic games": 0.0008257638315441783, + "Olympic Games in 2012": 0.0008257638315441783, + "2012 Olympic and Paralympic Games": 0.0008257638315441783, + "Olympics in 2012": 0.0008257638315441783, + "2012 London Olympic games": 0.0008257638315441783, + "Olympics and Paralympics": 0.0008257638315441783, + "London Ambassadors for the Olympic and Paralympic Games": 0.0008257638315441783, + "London's 2012": 0.0008257638315441783, + "London 2012 Olympic Games.": 0.0008257638315441783, + "2012 Olympians": 0.0008257638315441783, + "Olympic Games 2012": 0.0008257638315441783, + "Olympics 2012": 0.0008257638315441783 + }, + "Q194209": { + "2012": 0.26666666666666666, + "London": 0.06666666666666667, + "2012 Summer Olympics": 0.26666666666666666, + 
"2012 Olympics": 0.26666666666666666, + "2012 London Summer Olympics": 0.06666666666666667, + "2012 Summer Olympics in London": 0.06666666666666667 + }, + "Q8699": { + "L": 0.013888888888888888, + "Leeds": 0.013888888888888888, + "LBA": 0.013888888888888888, + "Leeds Bradford": 0.2638888888888889, + "Leeds Bradford Airport": 0.41666666666666663, + "Yeadon": 0.013888888888888888, + "Leeds Bradford International Airport": 0.15277777777777776, + "eeds\u2013Bradford (Airport)": 0.013888888888888888, + "Leeds/Bradford": 0.05555555555555555, + "Leeds-Bradford": 0.027777777777777776, + "Yeadon Aerodrome": 0.013888888888888888 + }, + "Q55018": { + "Royal Opera House": 0.5739348370927317, + "London": 0.0012531328320802004, + "Covent Garden": 0.24436090225563908, + "Theatre Royal": 0.0037593984962406013, + "English National Opera": 0.0012531328320802004, + "Royal Opera": 0.005012531328320802, + "Covent Garden Theatre": 0.05388471177944862, + "Linbury Theatre": 0.002506265664160401, + "Royal Italian Opera, Covent Garden": 0.0012531328320802004, + "Royal Opera House, Covent Garden": 0.03759398496240601, + "Covent Garden Opera House": 0.006265664160401002, + "Covent Garden Opera": 0.002506265664160401, + "Theatre Royal, Covent Garden": 0.03007518796992481, + "Royal Opera House Covent Garden": 0.0037593984962406013, + "the Covent Garden theatre": 0.0012531328320802004, + "ROH": 0.0012531328320802004, + "the Opera": 0.0012531328320802004, + "English Opera House": 0.0012531328320802004, + "Linbury Studio Theatre": 0.006265664160401002, + "Covent Gardens": 0.0012531328320802004, + "The Royal Opera House": 0.006265664160401002, + "the Covent Garden Theatre": 0.0012531328320802004, + "the Theatre Royal, Covent Garden": 0.0012531328320802004, + "Covent Garden opera": 0.0012531328320802004, + "the Royal Opera Company": 0.0012531328320802004, + "Royal Italian Opera": 0.0037593984962406013, + "Orchestra of the Royal Opera House": 0.0012531328320802004, + "Covent Garden Company": 
0.0012531328320802004, + "Royal Italian Opera House, Covent Garden": 0.002506265664160401 + }, + "Q2354215": { + "central": 0.006211180124223603, + "Central": 0.008281573498964804, + "London": 0.004140786749482402, + "city centre": 0.002070393374741201, + "central London": 0.4078674948240166, + "Central London": 0.5610766045548654, + "central area": 0.004140786749482402, + "central area of London": 0.002070393374741201, + "centre of London": 0.002070393374741201, + "Central London's": 0.002070393374741201 + }, + "Q1988417": { + "Chinatown": 0.4878048780487805, + "London": 0.0975609756097561, + "London's Chinatown": 0.17073170731707318, + "Gerrard Street": 0.024390243902439025, + "Chinatown of London": 0.024390243902439025, + "China Town": 0.024390243902439025, + "Chinatown, London": 0.07317073170731708, + "Chinatown in London": 0.024390243902439025, + "London Chinatown": 0.07317073170731708 + }, + "Q951830": { + "mint": 0.016736401673640166, + "Royal Mint": 0.8661087866108785, + "London": 0.0041841004184100415, + "Mint": 0.0041841004184100415, + "mints": 0.008368200836820083, + "the Royal Mint": 0.0041841004184100415, + "London Mint": 0.0041841004184100415, + "Royal Mint's": 0.008368200836820083, + "royal mint": 0.012552301255230124, + "London mint": 0.0041841004184100415, + "the Mint": 0.008368200836820083, + "Tower Mint": 0.02092050209205021, + "Tower mint": 0.0041841004184100415, + "British Royal Mint": 0.0041841004184100415, + "The Royal Mint Experience": 0.0041841004184100415, + "seized the money held in trust at the mint of the Exchequer": 0.0041841004184100415, + "Royal mint": 0.008368200836820083, + "English Mint": 0.0041841004184100415, + "The Royal Mint": 0.008368200836820083 + }, + "Q84": { + "Augusta": 6.508721687060662e-05, + "Bavaria": 6.508721687060662e-05, + "London": 0.9822311897943244, + "North London": 0.00019526165061181986, + "metropolis": 6.508721687060662e-05, + "Londinium": 0.00013017443374121324, + "London, England": 0.011845873470450404, + 
"Greater London": 0.0003905233012236397, + "East London": 6.508721687060662e-05, + "West London": 0.0002603488674824265, + "London's": 0.0009763082530590992, + "Lundenwic": 6.508721687060662e-05, + "Lundenburh": 6.508721687060662e-05, + "south London": 6.508721687060662e-05, + "Romford": 6.508721687060662e-05, + "Chelsea, London": 6.508721687060662e-05, + "north London": 6.508721687060662e-05, + "London Town": 6.508721687060662e-05, + "hometown": 6.508721687060662e-05, + "London, United Kingdom": 0.0009112210361884927, + "the capital city": 6.508721687060662e-05, + "South East London": 6.508721687060662e-05, + "Lond.": 6.508721687060662e-05, + "Lon\"don\"": 6.508721687060662e-05, + "the metropolis": 6.508721687060662e-05, + "Lunden": 6.508721687060662e-05, + "west London": 6.508721687060662e-05, + "North West London": 0.00013017443374121324, + "South West London": 6.508721687060662e-05, + "London, UK": 0.0006508721687060662, + "Lundenburgh": 6.508721687060662e-05, + "London in January 1963": 6.508721687060662e-05, + "the British capital": 6.508721687060662e-05, + "Londoner": 6.508721687060662e-05, + "London\u2019s": 6.508721687060662e-05, + "court of St James's": 6.508721687060662e-05, + "Londres": 0.0002603488674824265, + "Londra": 0.0002603488674824265, + "big city": 6.508721687060662e-05 + }, + "Q8703": { + "of the same name": 0.0023584905660377358, + "London": 0.0047169811320754715, + "Gatwick Airport": 0.46462264150943394, + "London Gatwick": 0.1179245283018868, + "London Gatwick Airport": 0.1509433962264151, + "Gatwick": 0.17452830188679244, + "London-Gatwick": 0.03773584905660377, + "London\u2013Gatwick": 0.0330188679245283, + "Gatwick airport": 0.0047169811320754715, + "Airport Shuttle people-mover": 0.0023584905660377358, + "Gatwick, London": 0.0023584905660377358, + "London Gatwick Airport's": 0.0023584905660377358, + "Gatwick Airport shuttle system": 0.0023584905660377358 + }, + "Q2716505": { + "Stamford Hill": 0.9821428571428571, + "London": 
0.017857142857142856 + }, + "Q123738": { + "Hyde Park": 0.8356890459363957, + "London": 0.0017667844522968198, + "Hyde": 0.0017667844522968198, + "Hyde Park, London": 0.1413427561837456, + "Hyde Park Gate": 0.0017667844522968198, + "London's Hyde Park": 0.007067137809187279, + "BST Hyde Park festival": 0.0017667844522968198, + "British Summer Time: Hyde Park festival": 0.0017667844522968198, + "the park": 0.0017667844522968198, + "Hyde-Park": 0.0017667844522968198, + "Stanhope Lodge": 0.0017667844522968198, + "hyde park": 0.0017667844522968198 + }, + "Q205679": { + "Hoxton": 0.0036900369003690036, + "London": 0.0036900369003690036, + "Stoke Newington": 0.0036900369003690036, + "London Borough of Hackney": 0.39114391143911437, + "Hackney": 0.5571955719557196, + "Haggerston": 0.0036900369003690036, + "Hackney Wick": 0.0036900369003690036, + "De Beauvoir Town": 0.0036900369003690036, + "wider modern borough": 0.0036900369003690036, + "modern borough of Hackney": 0.0036900369003690036, + "Upper Clapton": 0.0036900369003690036, + "Hackney, East London": 0.01107011070110701, + "Borough of Hackney": 0.0036900369003690036, + "that of Hackney": 0.0036900369003690036 + }, + "Q20075": { + "London Underground": 0.885578876100203, + "London": 0.004062288422477996, + "lines": 0.0006770480704129993, + "tube": 0.017603249830737983, + "underground": 0.008124576844955992, + "Underground": 0.027758970886932972, + "London's": 0.0006770480704129993, + "Tube": 0.011509817197020988, + "the Underground": 0.002708192281651997, + "LU": 0.0006770480704129993, + "underground railway": 0.0006770480704129993, + "until 1863": 0.0006770480704129993, + "London underground": 0.002031144211238998, + "tube station": 0.0033852403520649964, + "subway train": 0.0006770480704129993, + "London Underground's": 0.002031144211238998, + "tube stations": 0.004062288422477996, + "Underground stations": 0.0006770480704129993, + "deep-level tube network": 0.0006770480704129993, + "Transport for London roundel": 
0.0006770480704129993, + "Underground roundel": 0.0033852403520649964, + "underground shelters": 0.0006770480704129993, + "tube trains": 0.0006770480704129993, + "London's underground rail network": 0.0006770480704129993, + "Tube roundel": 0.0006770480704129993, + "London Underground PPP": 0.0006770480704129993, + "tube carriages": 0.0006770480704129993, + "London Tube": 0.002031144211238998, + "the tube railways in London": 0.0006770480704129993, + "London's Underground": 0.0013540961408259986, + "London's underground railway system": 0.0006770480704129993, + "London Tube station": 0.0006770480704129993, + "tube roundel": 0.0006770480704129993, + "the Tube": 0.004062288422477996, + "underground train": 0.0006770480704129993, + "London tube railways": 0.0006770480704129993, + "tube-train": 0.0006770480704129993, + "sitting on the tube": 0.0006770480704129993, + "distinctive red-and-blue roundel": 0.0006770480704129993, + "system in London": 0.0006770480704129993, + "Tube Station": 0.0006770480704129993, + "Londons Underground subway system": 0.0006770480704129993, + "LUL": 0.0006770480704129993, + "London tube": 0.0006770480704129993, + "tube train": 0.0006770480704129993 + }, + "Q160302": { + "Edinburgh": 0.06723198061780739, + "University of Edinburgh": 0.789824348879467, + "university": 0.0018170805572380374, + "Edinburgh University": 0.12235009085402786, + "University": 0.0024227740763173833, + "Edinburgh.": 0.0006056935190793458, + "Edinburg": 0.0006056935190793458, + "Moray House College": 0.0012113870381586917, + "The University of Edinburgh": 0.0060569351907934586, + "College of Edinburgh": 0.0006056935190793458, + "University of Edinburgh's": 0.0012113870381586917, + "university of Edinburgh": 0.0024227740763173833, + "Edinburgh University Savoy Opera Group": 0.0006056935190793458, + "University of Edinburg": 0.0006056935190793458, + "Easter Bush": 0.0006056935190793458, + "Edinburgh University (1760-62)": 0.0006056935190793458, + "the University of 
Edinburgh": 0.0012113870381586917 + }, + "Q2422792": { + "metropolitan area": 0.012987012987012988, + "London": 0.025974025974025976, + "commuter town": 0.012987012987012988, + "commuting": 0.012987012987012988, + "commuter belt": 0.012987012987012988, + "greater metropolitan": 0.012987012987012988, + "London commuter belt": 0.5844155844155845, + "commutes": 0.025974025974025976, + "City of London commuters": 0.012987012987012988, + "London metropolitan area": 0.025974025974025976, + "stockbroker belt": 0.012987012987012988, + "Commuter Belt": 0.03896103896103896, + "London Commuter Belt": 0.12987012987012989, + "London Metropolitan Area": 0.025974025974025976, + "housing with gardens forming suburbs to London": 0.012987012987012988, + "commuter and retirement town": 0.012987012987012988, + "London commuters": 0.012987012987012988, + "London metro area": 0.012987012987012988 + }, + "Q171240": { + "stock exchange": 0.001466275659824047, + "LSE": 0.00439882697947214, + "London": 0.01906158357771261, + "London Stock Exchange": 0.9413489736070382, + "stock market": 0.001466275659824047, + "Stock Exchange": 0.01466275659824047, + "London financial markets": 0.001466275659824047, + "London stock exchange": 0.00879765395894428, + "stock-exchange": 0.001466275659824047, + "London Stock Exchange's": 0.001466275659824047, + "Stock Market": 0.001466275659824047, + "London exchange": 0.001466275659824047, + "stock exchange listing in London": 0.001466275659824047 + }, + "Q743535": { + "Chelsea": 0.7292035398230088, + "London": 0.0017699115044247787, + "Chelsea, London": 0.26194690265486725, + "the area in London, England": 0.0017699115044247787, + "Chelsea, England": 0.0035398230088495575, + "Chelsea Old Town Hall": 0.0017699115044247787 + }, + "Q193196": { + "University College London": 0.7849025974025975, + "London": 0.0008116883116883117, + "University of London": 0.0016233766233766235, + "London University": 0.007305194805194806, + "University College": 
0.029220779220779224, + "school": 0.0008116883116883117, + "UCL": 0.05113636363636364, + "University College, London": 0.10633116883116883, + "University College London's Special Collections": 0.0008116883116883117, + "11 constituent faculties": 0.0008116883116883117, + "UCL Press": 0.0016233766233766235, + "Center for European Studies": 0.0008116883116883117, + "Grant Museum of Zoology": 0.0008116883116883117, + "University College of London": 0.006493506493506494, + "UCL Art Museum": 0.0008116883116883117, + "University College London's": 0.0008116883116883117, + "London University College": 0.0016233766233766235, + "UCL East": 0.0008116883116883117, + "UCL Centre for Decision Making Uncertainty": 0.0008116883116883117, + "University College London (UCL)": 0.0008116883116883117, + "University College, Gower Street": 0.0008116883116883117 + }, + "Q795691": { + "Waterloo": 0.18468468468468469, + "London": 0.0045045045045045045, + "Waterloo Bridge": 0.0022522522522522522, + "railway station": 0.0022522522522522522, + "London Waterloo": 0.4954954954954955, + "London Victoria": 0.0022522522522522522, + "Waterloo Station": 0.02702702702702703, + "Waterloo station": 0.1554054054054054, + "London Waterloo station": 0.08783783783783784, + "Waterloo main line station": 0.0022522522522522522, + "Waterloo National Rail station": 0.0022522522522522522, + "London Waterloo railway station": 0.013513513513513514, + "Waterloo main-line station": 0.0022522522522522522, + "Waterloo railway station": 0.0045045045045045045, + "Waterloo Bridge station": 0.009009009009009009, + "retail balcony": 0.0022522522522522522, + "Waterloo Bridge Station": 0.0022522522522522522 + }, + "Q148349": { + "Waterloo": 0.01532567049808429, + "London": 0.0038314176245210726, + "Lambeth": 0.9195402298850575, + "Stockwell Infants School": 0.0038314176245210726, + "Stangate": 0.007662835249042145, + "Lambeth North": 0.0038314176245210726, + "South Lambeth": 0.026819923371647507, + "Stockwell Primary 
School": 0.0038314176245210726, + "Lambeth Walk": 0.0038314176245210726, + "North Lambeth": 0.007662835249042145, + "north Lambeth": 0.0038314176245210726 + }, + "Q1359589": { + "West End": 0.8505678421996413, + "London": 0.004781829049611476, + "West End of London": 0.002390914524805738, + "West End theatre": 0.07053197848176927, + "London's West End": 0.01852958756724447, + "London West End": 0.0029886431560071725, + "theatre district": 0.0005977286312014345, + "West End Theatre": 0.003586371787208607, + "theatre area": 0.0005977286312014345, + "Theatre District": 0.0005977286312014345, + "London stage": 0.0041841004184100415, + "Theatreland": 0.0053795576808129105, + "West End productions": 0.001195457262402869, + "the West End": 0.002390914524805738, + "West End shows": 0.0017931858936043035, + "The West End": 0.0005977286312014345, + "West End drama": 0.0005977286312014345, + "London's theatrical world": 0.0005977286312014345, + "West End theater": 0.001195457262402869, + "West End stage": 0.0053795576808129105, + "West End theatre district": 0.001195457262402869, + "West End theatre company.": 0.0005977286312014345, + "West End revival": 0.0029886431560071725, + "West End musical": 0.002390914524805738, + "West End theatreland": 0.0005977286312014345, + "London's theatre district": 0.0005977286312014345, + "West End production": 0.002390914524805738, + "List of West End theatres": 0.0005977286312014345, + "West End musicals": 0.001195457262402869, + "West End London": 0.0005977286312014345, + "West End's": 0.0005977286312014345, + "London productions": 0.0005977286312014345, + "West-end stage": 0.0005977286312014345, + "West End hit": 0.0005977286312014345, + "London theatre": 0.001195457262402869, + "London Theatre": 0.0005977286312014345, + "West End.": 0.0005977286312014345, + "non-commercial West End theatre": 0.0005977286312014345, + "commercial theatre": 0.0005977286312014345, + "London theatres": 0.0005977286312014345, + "west-end": 
0.0005977286312014345, + "theatres in the West End of London": 0.0005977286312014345, + "London stage production": 0.0005977286312014345 + }, + "Q42182": { + "Buckingham Palace": 0.9712820512820513, + "London": 0.0010256410256410256, + "Buckingham": 0.0010256410256410256, + "Buckingham House": 0.021538461538461538, + "Arlington House": 0.0010256410256410256, + "the Palace": 0.0010256410256410256, + "Goring House": 0.0010256410256410256, + "Buckingham house": 0.0010256410256410256, + "Royal household": 0.0010256410256410256 + }, + "Q1466941": { + "York": 0.006060606060606061, + "Leeds": 0.8484848484848485, + "Leeds City": 0.01818181818181818, + "Leeds railway station": 0.07878787878787878, + "Leeds Wellington": 0.006060606060606061, + "Wellington Station": 0.006060606060606061, + "Leeds Station": 0.006060606060606061, + "Leeds Wellington station": 0.006060606060606061, + "Leeds Wellington Station": 0.006060606060606061, + "Leeds station": 0.012121212121212121, + "Leeds First": 0.006060606060606061 + }, + "Q6900329": { + "bombing": 0.003605769230769231, + "London": 0.007211538461538462, + "East End": 0.001201923076923077, + "the Blitz": 0.4338942307692308, + "bombed": 0.003605769230769231, + "air raid": 0.004807692307692308, + "1940": 0.001201923076923077, + "London Blitz": 0.09375, + "air raids": 0.007211538461538462, + "The Blitz": 0.17908653846153846, + "Blitz": 0.12740384615384617, + "heavily bombed": 0.001201923076923077, + "aerial bombardment": 0.003605769230769231, + "bombs": 0.001201923076923077, + "German bombing": 0.00841346153846154, + "German bombings": 0.002403846153846154, + "bombing raid": 0.002403846153846154, + "German Luftwaffe bombing": 0.001201923076923077, + "blitz": 0.010817307692307692, + "London blitz": 0.014423076923076924, + "the subsequent Blitz": 0.001201923076923077, + "World War Two": 0.001201923076923077, + "bombing raids": 0.004807692307692308, + "nightly air raids": 0.003605769230769231, + "bomb damage": 0.001201923076923077, + 
"bombing London": 0.001201923076923077, + "bombings": 0.001201923076923077, + "blitzed": 0.002403846153846154, + "bombing of London": 0.007211538461538462, + "widespread destruction of its infrastructure": 0.001201923076923077, + "Nazi bombings of London": 0.001201923076923077, + "Operation Loge": 0.001201923076923077, + "bombing of the city": 0.001201923076923077, + "the 1941 Blitz": 0.001201923076923077, + "bombed during World War II": 0.001201923076923077, + "heavy aerial bombing": 0.001201923076923077, + "The aerial bombing of World War II": 0.001201923076923077, + "World War II bomb damage": 0.002403846153846154, + "German bombing campaign": 0.001201923076923077, + "bombing of the City of London": 0.001201923076923077, + "Blitz of 1940\u201341": 0.001201923076923077, + "wartime London": 0.001201923076923077, + "German bombing campaign of 1940\u201341": 0.001201923076923077, + "night bombing": 0.001201923076923077, + "1941 London Blitz": 0.001201923076923077, + "the German bombing campaign": 0.001201923076923077, + "the German bombing campaign over England during World War II": 0.001201923076923077, + "bomb damage during the Second World War": 0.001201923076923077, + "the Blitz of London": 0.001201923076923077, + "London to be razed": 0.001201923076923077, + "World War II bombing raid": 0.001201923076923077, + "bombing raids across London": 0.001201923076923077, + "Nazi bombing of British cities": 0.001201923076923077, + "1940s Blitz": 0.001201923076923077, + "Nazi German air raids over London": 0.001201923076923077, + "German \"Blitz\"": 0.001201923076923077, + "heavily": 0.001201923076923077, + "wartime bomb damage": 0.001201923076923077, + "German incendiary bombs": 0.002403846153846154, + "firebombing in the Second World War": 0.001201923076923077, + "night-time bombing of Britain": 0.001201923076923077, + "Bombing of London": 0.001201923076923077, + "attacking the docks and warehouses": 0.001201923076923077, + "German campaign of bombings": 
0.001201923076923077, + "German bombs": 0.001201923076923077, + "the London Blitz": 0.002403846153846154, + "World War II bombing": 0.001201923076923077, + "was bomb-struck": 0.001201923076923077, + "aerial bombardment of London": 0.001201923076923077, + "German bombing of British cities": 0.001201923076923077, + "bombing British cities": 0.001201923076923077, + "German bomb": 0.001201923076923077, + "German bombing raid": 0.001201923076923077, + "German Blitz bombing of London": 0.001201923076923077, + "bombings on London": 0.001201923076923077, + "bombardment of 29th-30th December 1940": 0.001201923076923077, + "damaged by a bomb in 1940": 0.001201923076923077, + "aerial bombing of London": 0.001201923076923077, + "raid over Britain": 0.001201923076923077, + "Second World War bomb damage": 0.001201923076923077, + "bound for England": 0.001201923076923077, + "London bombing": 0.001201923076923077 + }, + "Q985210": { + "Victoria": 0.21700879765395895, + "London": 0.01759530791788856, + "London Victoria station": 0.07624633431085044, + "London Victoria": 0.44868035190615835, + "Victoria Station": 0.11143695014662756, + "London Victoria railway station": 0.002932551319648094, + "Victoria, London": 0.002932551319648094, + "Victoria station": 0.08797653958944282, + "Victoria railway station": 0.011730205278592375, + "London (Victoria)": 0.002932551319648094, + "London Victoria Station": 0.011730205278592375, + "Victoria tube station": 0.00879765395894428 + }, + "Q279459": { + "Victoria": 0.047619047619047616, + "London": 0.07936507936507936, + "Victoria Coach Station": 0.6825396825396826, + "London Victoria": 0.15873015873015872, + "London (Victoria)": 0.015873015873015872, + "adjacent national coach station": 0.015873015873015872 + }, + "Q23311": { + "the City": 0.027230590961761298, + "London": 0.033603707995365, + "the city": 0.0005793742757821553, + "city": 0.0034762456546929316, + "City": 0.06836616454229433, + "City of London": 0.839513325608343, + "London, 
England": 0.0005793742757821553, + "London City": 0.0005793742757821553, + "The City": 0.005793742757821553, + "The City of London": 0.0017381228273464658, + "London city fathers": 0.0005793742757821553, + "trading heart of the capital": 0.0005793742757821553, + "city of London": 0.006373117033603708, + "the City of London": 0.002317497103128621, + "City Bars": 0.0005793742757821553, + "city workers": 0.0011587485515643105, + "St Pauls": 0.0005793742757821553, + "old city center": 0.0011587485515643105, + "Square Mile": 0.0028968713789107765, + "London's financial district": 0.0005793742757821553, + "The Square Mile": 0.0005793742757821553, + "historic City boundaries": 0.0005793742757821553, + "London EC3": 0.0005793742757821553 + }, + "Q130206": { + "bridge": 0.01090909090909091, + "London": 0.007272727272727273, + "London Bridge": 0.9272727272727272, + "Roman bridge": 0.0036363636363636364, + "New London Bridge": 0.007272727272727273, + "the rebuilding of London Bridge": 0.007272727272727273, + "its namesake": 0.0036363636363636364, + "Old London Bridge": 0.01090909090909091, + "London-bridge": 0.0036363636363636364, + "the Rennie London Bridge": 0.0036363636363636364, + "new London Bridge": 0.0036363636363636364, + "London Bridge of 1209 to 1831": 0.0036363636363636364, + "Loddon Bridge": 0.0036363636363636364, + "1831 London Bridge": 0.0036363636363636364 + }, + "Q5338273": { + "Edinburgh University": 0.75, + "Edinburgh University Ladies": 0.25 + }, + "Q170027": { + "London": 0.04149026248941575, + "University of London": 0.8882303132938187, + "London University": 0.058425063505503805, + "Chancellor": 0.000846740050804403, + "University College, London": 0.001693480101608806, + "Lond.": 0.000846740050804403, + "University of London intercollegiate halls of residence": 0.000846740050804403, + "the University of London": 0.000846740050804403, + "Athlone Press": 0.000846740050804403, + "Chancellor of the University of London": 0.000846740050804403, + 
"Universities of London": 0.001693480101608806, + "The University of London": 0.000846740050804403, + "many constituent colleges": 0.000846740050804403, + "London University School": 0.000846740050804403, + "'schemes of special relations'": 0.000846740050804403 + }, + "Q800751": { + "London": 0.042105263157894736, + "Euston station": 0.07105263157894737, + "London Euston": 0.5842105263157895, + "Euston": 0.1631578947368421, + "Euston railway station": 0.07105263157894737, + "London terminus": 0.002631578947368421, + "Euston Station": 0.049999999999999996, + "Euston main line station": 0.002631578947368421, + "London Euston station": 0.005263157894736842, + "(Euston": 0.002631578947368421, + "EUS": 0.002631578947368421, + "Euston Railway Station": 0.002631578947368421 + }, + "Q214788": { + "London": 0.02122641509433962, + "Paddington": 0.16981132075471697, + "London Paddington station": 0.07075471698113207, + "Paddington Station": 0.06132075471698113, + "London Paddington": 0.5660377358490566, + "Paddington station": 0.08962264150943396, + "Paddington rail station": 0.0023584905660377358, + "Paddington railway station": 0.011792452830188678, + "London_Paddington": 0.0023584905660377358, + "London Paddington railway station": 0.0023584905660377358, + "London (Paddington)": 0.0023584905660377358 + }, + "Q1488404": { + "London": 0.058823529411764705, + "docks": 0.029411764705882353, + "London Docks": 0.8235294117647058, + "Docks": 0.029411764705882353, + "Hermitage entrance": 0.029411764705882353, + "London docks": 0.029411764705882353 + }, + "Q14946379": { + "London": 0.18518518518518517, + "Diocese of London": 0.5462962962962963, + "diocese of London": 0.12962962962962962, + "See of London": 0.046296296296296294, + "Bishops of London": 0.009259259259259259, + "see of London": 0.07407407407407407, + "London Diocesan House": 0.009259259259259259 + }, + "Q92561": { + "London": 0.3769633507853403, + "City of London": 0.003926701570680629, + "London, Ontario": 
0.6020942408376964, + "London's": 0.0013089005235602095, + "Fanshawe": 0.0013089005235602095, + "London, Canada West": 0.002617801047120419, + "London (Ontario)": 0.002617801047120419, + "London, Ontario, Canada": 0.0013089005235602095, + "Eager Beaver Baseball Association": 0.002617801047120419, + "London Police Service": 0.0013089005235602095, + "Hubrey": 0.0013089005235602095, + "City of London, Ontario": 0.002617801047120419 + }, + "Q2477346": { + "London": 0.8461538461538463, + "Londres": 0.15384615384615385 + }, + "Q3061911": { + "London": 0.6923076923076923, + "London, Kentucky": 0.2564102564102564, + "London, KY": 0.05128205128205128 + }, + "Q1137312": { + "London": 0.058252427184466014, + "County of London": 0.9174757281553397, + "Inner London": 0.0048543689320388345, + "county of London": 0.019417475728155338 + }, + "Q6670323": { + "London": 0.2222222222222222, + "London District": 0.7777777777777777 + }, + "Q8691": { + "London": 0.026713124274099883, + "airport": 0.0011614401858304297, + "Heathrow Airport": 0.4796747967479675, + "London Heathrow": 0.10569105691056911, + "London Heathrow Airport": 0.15098722415795587, + "London, England": 0.0011614401858304297, + "London Airport": 0.010452961672473868, + "Heathrow": 0.16492450638792103, + "Heathrow International Airport": 0.0011614401858304297, + "London-Heathrow": 0.013937282229965157, + "London's Heathrow Airport": 0.0023228803716608595, + "London Airport (Heathrow)": 0.0011614401858304297, + "Heathrow airport": 0.011614401858304297, + "London\u2013Heathrow": 0.023228803716608595, + "Heathrow Airport Terminal 5": 0.0011614401858304297, + "LHR": 0.0011614401858304297, + "London Heathrow Airport (LHR)": 0.0011614401858304297, + "Philip Sherwood": 0.0011614401858304297, + "London Heathrow airport": 0.0011614401858304297 + }, + "Q1545354": { + "London": 0.18181818181818182, + "docks": 0.010101010101010102, + "Port of London": 0.7272727272727273, + "tidal part of the Thames": 0.010101010101010102, + "Port of 
London Authority": 0.010101010101010102, + "port of London": 0.020202020202020204, + "London docks": 0.010101010101010102, + "London's historic docks": 0.010101010101010102, + "enclosed dock system": 0.010101010101010102, + "Control Centre": 0.010101010101010102 + }, + "Q578794": { + "London": 0.03870967741935484, + "Virgin London Marathon": 0.01935483870967742, + "London Marathon": 0.9225806451612903, + "London Marathon Charitable Trust": 0.0064516129032258064, + "the London Marathon": 0.0064516129032258064, + "London marathon": 0.0064516129032258064 + }, + "Q1415441": { + "London": 0.014285714285714285, + "Southend": 0.22857142857142856, + "London Southend Airport": 0.2857142857142857, + "Rochford": 0.02857142857142857, + "Southend Airport": 0.22857142857142856, + "Southend Municipal Airport": 0.014285714285714285, + "Southend (Rochford)": 0.014285714285714285, + "London Southend": 0.05714285714285714, + "Southend Airport (Rochford)": 0.014285714285714285, + "RAF Rochford": 0.05714285714285714, + "London South": 0.014285714285714285, + "nd": 0.014285714285714285, + "London-Southend": 0.014285714285714285, + "London-Southend Airport": 0.014285714285714285 + }, + "Q6669759": { + "London": 1.0 + }, + "Q219867": { + "London": 0.025423728813559324, + "King's Cross": 0.2909604519774011, + "King's Cross station": 0.09322033898305085, + "Kings Cross": 0.0423728813559322, + "London King's Cross": 0.3559322033898305, + "King's Cross Station": 0.03389830508474576, + "London King's Cross railway station": 0.03954802259887005, + "King's Cross, London": 0.002824858757062147, + "Kings Cross railway station": 0.002824858757062147, + "King's Cross railway station": 0.0423728813559322, + "Kings Cross Station": 0.002824858757062147, + "London King's Cross station": 0.011299435028248588, + "London Kings Cross Railway Station": 0.002824858757062147, + "London Kings Cross": 0.031073446327683614, + "London Kings Cross station": 0.002824858757062147, + "King's Cross mainline station": 
0.005649717514124294, + "King's Cross York Road": 0.002824858757062147, + "King's Cross station, London": 0.002824858757062147, + "King's Cross terminal": 0.002824858757062147, + "King\u2019s Cross station": 0.005649717514124294 + }, + "Q795678": { + "London": 0.03333333333333333, + "London Waterloo": 0.1, + "Waterloo International railway station": 0.2, + "Waterloo International": 0.3, + "Waterloo Station": 0.03333333333333333, + "Waterloo International terminal": 0.03333333333333333, + "Waterloo International station": 0.2, + "London terminus": 0.03333333333333333, + "London Waterloo International": 0.06666666666666667 + }, + "Q7242790": { + "London": 0.06896551724137931, + "Gay Pride": 0.034482758620689655, + "Pride London": 0.3793103448275862, + "London Pride Parade": 0.034482758620689655, + "Pride in London": 0.1724137931034483, + "London Pride": 0.13793103448275862, + "London Gay Pride parade": 0.034482758620689655, + "London Gay Pride Festival": 0.034482758620689655, + "Lesbian and Gay Pride": 0.034482758620689655, + "anti-transgender protesters at that year's London Pride event": 0.034482758620689655, + "London Gay Pride Week": 0.034482758620689655 + }, + "Q216185": { + "London": 0.002421307506053269, + "central London": 0.009685230024213076, + "Charing Cross": 0.9685230024213075, + "Charing": 0.004842615012106538, + "centre of London": 0.007263922518159807, + "official centre of London": 0.002421307506053269, + "London's centre point": 0.002421307506053269, + "Charring Cross": 0.002421307506053269 + }, + "Q2018322": { + "London": 0.03225806451612903, + "gay": 0.03225806451612903, + "Old Compton Street": 0.9354838709677419 + }, + "Q720102": { + "London": 0.015789473684210527, + "railway station": 0.002631578947368421, + "London St Pancras": 0.3131578947368421, + "St Pancras": 0.16578947368421051, + "St Pancras railway station": 0.16052631578947368, + "St Pancras International": 0.09736842105263158, + "London St Pancras International": 0.09736842105263158, + 
"St. Pancras": 0.021052631578947368, + "St. Pancras International": 0.007894736842105263, + "St Pancras International station": 0.007894736842105263, + "London terminus": 0.002631578947368421, + "London St Pancras International station": 0.002631578947368421, + "St Pancras Station": 0.02631578947368421, + "St Pancras station": 0.039473684210526314, + "London St. Pancras": 0.021052631578947368, + "a new station underneath St Pancras": 0.002631578947368421, + "St. Pancras Station": 0.002631578947368421, + "Barlow train shed": 0.002631578947368421, + "(St Pancras) (STP)": 0.002631578947368421, + "London-St Pancras": 0.002631578947368421, + "its international railway station": 0.002631578947368421, + "London St Pancras station": 0.002631578947368421 + }, + "Q23306": { + "London": 0.04216216216216216, + "Greater London": 0.9448648648648649, + "London's": 0.001081081081081081, + "London region": 0.005405405405405406, + "Greater London's": 0.003243243243243243, + "South East London": 0.001081081081081081, + "London conurbation": 0.001081081081081081, + "Greater London county": 0.001081081081081081 + }, + "Q1449564": { + "London": 0.7142857142857142, + "London station": 0.2857142857142857 + }, + "Q733210": { + "London": 0.09090909090909091, + "1948": 0.18181818181818182, + "1948 Summer Olympics": 0.09090909090909091, + "Olympic tournament": 0.09090909090909091, + "1948 Olympic Games": 0.18181818181818182, + "1948 Olympics": 0.09090909090909091, + "Olympic basketball": 0.09090909090909091, + "at the 1948 Summer Olympics": 0.09090909090909091, + "1948 Olympics in London": 0.09090909090909091 + }, + "Q14710970": { + "London": 0.7272727272727273, + "other community of the same name": 0.09090909090909091, + "London, Texas": 0.18181818181818182 + }, + "Q1001456": { + "London": 0.6071428571428571, + "London, Ohio": 0.39285714285714285 + }, + "Q503516": { + "London": 0.07142857142857142, + "Laurel": 0.17857142857142855, + "Laurel County": 0.5714285714285714, + "Laurel Counties": 
0.03571428571428571, + "Laurel County, Kentucky": 0.14285714285714285 + }, + "Q8982": { + "London": 0.007352941176470588, + "City": 0.007352941176470588, + "London City Airport": 0.8014705882352942, + "City Airport": 0.007352941176470588, + "London City": 0.125, + "London-City": 0.029411764705882353, + "London City airport": 0.007352941176470588, + "London\u2013City": 0.014705882352941176 + }, + "Q22059065": { + "London": 1.0 + }, + "Q8712": { + "London": 0.012121212121212121, + "Luton": 0.23030303030303031, + "Luton Airport": 0.3151515151515151, + "London Luton Airport": 0.22424242424242424, + "the airport": 0.012121212121212121, + "Luton airport": 0.01818181818181818, + "local airport": 0.006060606060606061, + "London Luton": 0.12121212121212122, + "London\u2013Luton": 0.012121212121212121, + "Luton International Airport": 0.006060606060606061, + "London (Luton)": 0.006060606060606061, + "London-Luton": 0.030303030303030304, + "London Luton Airport Ltd": 0.006060606060606061 + }, + "Q20657974": { + "London": 1.0 + }, + "Q565521": { + "London": 0.012345679012345678, + "Clarence House": 0.9753086419753085, + "Clarence": 0.012345679012345678 + }, + "Q238587": { + "London": 0.0026595744680851063, + "National Portrait Gallery": 0.8031914893617021, + "National Portrait Gallery, London": 0.1622340425531915, + "in London": 0.0026595744680851063, + "National Portrait Gallery's": 0.0026595744680851063, + "National Portrait Gallery in London": 0.005319148936170213, + "Portrait Gallery": 0.0026595744680851063, + "it Gallery": 0.0026595744680851063, + "The National Portrait Gallery": 0.005319148936170213, + "National Portrait Gallery (United Kingdom)": 0.0026595744680851063, + "London's National Portrait Gallery": 0.0026595744680851063, + "National Portrait Gallery (London)": 0.005319148936170213 + }, + "Q122744": { + "London": 0.007462686567164179, + "Maida Vale": 0.8582089552238806, + "Maida Vale Studios": 0.007462686567164179, + "Little Venice": 0.06716417910447761, + 
"Maida Hill": 0.03731343283582089, + "BBC studios": 0.007462686567164179, + "Maida Vale studios": 0.007462686567164179, + "Maida Vale 4": 0.007462686567164179 + }, + "Q15179170": { + "London": 0.14285714285714285, + "Alexandra Palace transmitter": 0.14285714285714285, + "home": 0.14285714285714285, + "Alexandra Palace": 0.42857142857142855, + "the transmitter at Alexandra Palace": 0.14285714285714285 + }, + "Q10818": { + "London": 0.008097165991902834, + "July 7, 2005 London bombings": 0.012145748987854251, + "7 July 2005 London bombings": 0.4534412955465587, + "7": 0.004048582995951417, + "four coordinated terrorist attacks": 0.004048582995951417, + "2005 London bombings": 0.048582995951417005, + "series of terrorist attacks": 0.004048582995951417, + "terrorist attack": 0.012145748987854251, + "bombings": 0.008097165991902834, + "London underground bombing": 0.004048582995951417, + "bombings on the underground and bus systems": 0.004048582995951417, + "7 July Review Committee": 0.004048582995951417, + "a series of four suicide bombings": 0.004048582995951417, + "London bombings": 0.05668016194331984, + "7/7": 0.020242914979757085, + "terrorist bombings in London": 0.004048582995951417, + "7 July 2005 London Bombings": 0.004048582995951417, + "bombs exploded": 0.004048582995951417, + "day's London bombings": 0.004048582995951417, + "7 July 2005 bombing": 0.004048582995951417, + "7 July 2005 terrorist attacks": 0.004048582995951417, + "\"7/7\" bombings": 0.004048582995951417, + "terrorist bombings": 0.004048582995951417, + "July 2005": 0.008097165991902834, + "July\u00a07, 2005": 0.004048582995951417, + "7/7 London bombings": 0.012145748987854251, + "terrorist attacks on London Transport": 0.004048582995951417, + "7/7 bombings, London": 0.004048582995951417, + "London bombings, 2005": 0.004048582995951417, + "'7/7'": 0.004048582995951417, + "7/7 terror attackers": 0.004048582995951417, + "7 July bombings": 0.004048582995951417, + "London bombings in July 2005": 
0.004048582995951417, + "7/7 terrorist attacks": 0.008097165991902834, + "London in 2005": 0.004048582995951417, + "London Bombings": 0.004048582995951417, + "7 July terrorist attacks": 0.004048582995951417, + "bus-bombing of 7 July 2005": 0.004048582995951417, + "2005 London terrorist bombings": 0.004048582995951417, + "7 July London bombings": 0.020242914979757085, + "7 July": 0.016194331983805668, + "2005 London Bombings": 0.004048582995951417, + "London bombings of 7 July 2005": 0.008097165991902834, + "7 July 2005 bombings": 0.012145748987854251, + "videotaped statements of the 7 July 2005 London bombers": 0.004048582995951417, + "two terrorist bombings": 0.008097165991902834, + "7 July 2005": 0.024291497975708502, + "7 July 2005 bombers": 0.004048582995951417, + "London bombings on 7 July 2005": 0.004048582995951417, + "terrorist bombings in London on July 7, 2005": 0.004048582995951417, + "July London bombings": 0.004048582995951417, + "7/7 bombings": 0.012145748987854251, + "July 7 bombings": 0.008097165991902834, + "7 July terror attacks": 0.004048582995951417, + "July 2005 London bombings": 0.004048582995951417, + "7 July 2005 London bombers": 0.004048582995951417, + "July 7, 2005 bombings": 0.004048582995951417, + "London Underground bombings of July 7, 2005": 0.004048582995951417, + "July 7 London bombings": 0.008097165991902834, + "2005 London Tube bombings": 0.004048582995951417, + "suicide bombers of 7/7": 0.004048582995951417, + "three bombs exploded": 0.004048582995951417, + "terrorist attacks in London": 0.008097165991902834, + "terrorist bombings in London on 7 July 2005": 0.004048582995951417, + "2005 London Underground bombings": 0.004048582995951417, + "2005 terrorist bombings in London": 0.004048582995951417, + "July 7th London bombings": 0.004048582995951417, + "London bombings in 2005": 0.004048582995951417, + "7 July 2005 London bombing": 0.004048582995951417, + "bomb attacks in London": 0.004048582995951417, + "London bombings of July 7, 
2005": 0.004048582995951417, + "bombings on London's transport system": 0.004048582995951417, + "7 July 2005 attacks on London": 0.004048582995951417, + "7 July 2005 bombings in London": 0.004048582995951417, + "London 7/7": 0.004048582995951417, + "7/7 terrorist attacks in London": 0.004048582995951417, + "7/7 London": 0.004048582995951417 + }, + "Q649419": { + "London": 0.013157894736842105, + "Marylebone": 0.256578947368421, + "London Marylebone": 0.5789473684210527, + "Marylebone station": 0.11184210526315788, + "Marylebone Station": 0.02631578947368421, + "Marylebone Railway Station": 0.006578947368421052, + "London (Marylebone)": 0.006578947368421052 + }, + "Q15242653": { + "London": 0.06666666666666667, + "London Museum": 0.9333333333333333 + }, + "Q6669738": { + "London": 1.0 + }, + "Q756819": { + "London": 0.003125, + "the Strand": 0.134375, + "Strand": 0.596875, + "The Strand": 0.15625, + "Strand, London": 0.08125, + "the street": 0.003125, + "the Strand in London": 0.003125, + "The Strand, London": 0.0125, + "the street in London": 0.003125, + "142 Strand, London": 0.003125, + "West Strand, London": 0.003125 + }, + "Q7443327": { + "London": 0.0625, + "Second Great Fire of London": 0.8125, + "blitz on London": 0.0625, + "one of the greatest fires in London's history": 0.0625 + }, + "Q4642035": { + "London": 0.16666666666666666, + "Baker Street": 0.16666666666666666, + "64 Baker Street": 0.6666666666666666 + }, + "Q729177": { + "London": 0.11764705882352941, + "Cleopatra's Needle": 0.8823529411764706 + }, + "Q1399178": { + "London": 0.125, + "Fazl Mosque": 0.75, + "mosque in London": 0.125 + }, + "Q5645763": { + "London": 0.14285714285714285, + "Hammersmith": 0.14285714285714285, + "Hammersmith bus station": 0.7142857142857142 + }, + "Q801124": { + "London": 0.015463917525773196, + "London Liverpool Street": 0.538659793814433, + "Liverpool Street": 0.21649484536082475, + "Liverpool Street station": 0.17010309278350516, + "Liverpool Street Station": 
0.030927835051546393, + "Bishopsgate station": 0.005154639175257732, + "Liverpool Street railway station": 0.010309278350515464, + "Liverpool Street Railway station": 0.002577319587628866, + "London Liverpool Street station": 0.007731958762886598, + "London Liverpool Street Station": 0.002577319587628866 + }, + "Q7737135": { + "London": 0.16666666666666666, + "assay office": 0.16666666666666666, + "the Goldsmiths' Company Assay Office": 0.16666666666666666, + "The Goldsmiths' Company Assay Office": 0.5 + }, + "Q4834838": { + "London": 0.006211180124223602, + "Radio London": 0.037267080745341616, + "GLR": 0.05590062111801242, + "Greater London Radio": 0.043478260869565216, + "BBC London 94.9": 0.18012422360248445, + "BBC Radio London": 0.5714285714285714, + "BBC Greater London Radio": 0.006211180124223602, + "BBC London Live": 0.012422360248447204, + "BBC GLR": 0.08074534161490683, + "BBC Radio London 94.9": 0.006211180124223602 + }, + "Q17509255": { + "London": 0.09090909090909091, + "Chiswell Street": 0.9090909090909092 + }, + "Q800753": { + "London": 0.01020408163265306, + "Fenchurch Street": 0.4183673469387755, + "Fenchurch Street railway station": 0.1530612244897959, + "Fenchurch Street station": 0.0510204081632653, + "London Fenchurch Street": 0.32653061224489793, + "Fenchurch Street railway line": 0.01020408163265306, + "Fenchurch Street Station": 0.030612244897959183 + }, + "Q6671078": { + "London": 0.12, + "London's successful bid": 0.12, + "2012 Olympics and Paralympics": 0.04, + "London 2012": 0.04, + "London bid committee": 0.04, + "London bid for the 2012 Summer Olympics": 0.12, + "winning bid for the 2012 Olympic Games": 0.04, + "2012 Olympics bid": 0.04, + "London to host the 2012 Olympics": 0.04, + "London 2012 Olympic bid": 0.16, + "London 2012 Olympic Bid": 0.04, + "successful bid": 0.04, + "2012 Olympic bid": 0.04, + "London's 2012 Olympic bid": 0.04, + "London won the bid for the 2012 Summer Olympics": 0.04, + "London's Olympic and Paralympic 
bid": 0.04 + }, + "Q186309": { + "London": 0.005813953488372093, + "Orlando": 0.005813953488372093, + "Madame Tussauds": 0.8546511627906976, + "Madame Tussaud's": 0.05232558139534883, + "Madame Tussauds Sydney": 0.005813953488372093, + "Madame Tussauds Wax Museums": 0.005813953488372093, + "Madame Tussaud Wax Museum": 0.005813953488372093, + "an exhibition": 0.005813953488372093, + "Chamber of Horrors": 0.011627906976744186, + "Madame Tussaud's Wax Museum": 0.005813953488372093, + "Madame Tussauds Wax Works": 0.005813953488372093, + "Madame Tussauds Wax Museum": 0.005813953488372093, + "Madame Tussaud's wax museum": 0.005813953488372093, + "Madame Tussauds London": 0.011627906976744186, + "Tussaud": 0.011627906976744186 + }, + "Q212883": { + "London": 0.3333333333333333, + "diving": 0.6666666666666666 + }, + "Q195436": { + "London": 0.004366812227074236, + "Tate Modern": 0.004366812227074236, + "Tate Gallery": 0.08733624454148471, + "Tate": 0.021834061135371178, + "Tate Britain": 0.8253275109170305, + "The Tate": 0.004366812227074236, + "Tate Gallery, London": 0.004366812227074236, + "Clore Gallery": 0.008733624454148471, + "National Gallery of British Art": 0.013100436681222707, + "Millbank Gallery": 0.004366812227074236, + "Tate Library": 0.004366812227074236, + "Tate Britain, London": 0.004366812227074236, + "Tate Britain art gallery": 0.004366812227074236, + "Tate collections": 0.004366812227074236, + "Tate Archive": 0.004366812227074236 + }, + "Q5038252": { + "London": 1.0 + }, + "Q83609": { + "London": 0.008, + "Acton": 0.856, + "Acton, West London": 0.016, + "Acton, London": 0.10400000000000001, + "East Acton": 0.008, + "Acton hospital": 0.008 + }, + "Q79348": { + "London": 1.0 + }, + "Q4801470": { + "London": 0.024390243902439025, + "Arts Educational Schools, London": 0.12195121951219512, + "ArtsEd": 0.0975609756097561, + "Arts Educational Schools": 0.1951219512195122, + "The Arts Educational Schools": 0.04878048780487805, + "Arts Educational School": 
0.3902439024390244, + "the Arts Educational School": 0.024390243902439025, + "Arts Educational School in London": 0.024390243902439025, + "Arts Educational Schools London": 0.04878048780487805, + "Arts Educational Trust School": 0.024390243902439025 + }, + "Q220198": { + "London": 0.0055248618784530384, + "Zoological Society of London": 0.7458563535911602, + "London Society": 0.0055248618784530384, + "Zoological Society": 0.08839779005524862, + "Zoological Gardens": 0.0055248618784530384, + "Zoological": 0.027624309392265192, + "London Zoological Society": 0.03867403314917127, + "Regent's Park Zoological Gardens in London": 0.0055248618784530384, + "FZS": 0.055248618784530384, + "Fellow of the Zoological Society": 0.0055248618784530384, + "Zoological Society of London (ZSL)": 0.0055248618784530384, + "Zoological societies": 0.0055248618784530384, + "F.Z.S.": 0.0055248618784530384 + }, + "Q124234": { + "London": 0.009615384615384616, + "St James's": 0.701923076923077, + "St. James's": 0.19230769230769232, + "St James": 0.009615384615384616, + "Parish of St James": 0.009615384615384616, + "St James's Parish": 0.009615384615384616, + "Regent Street St. James (Lower Regent Street)": 0.009615384615384616, + "St James, London": 0.009615384615384616, + "St James Place": 0.009615384615384616, + "Regent Street St James (Lower Regent Street)": 0.009615384615384616, + "St James's, London": 0.009615384615384616, + "St. 
James's, London": 0.009615384615384616, + "St.\u00a0James's": 0.009615384615384616 + }, + "Q23298": { + "London": 0.0004050222762251924, + "English county": 0.0004050222762251924, + "Kent": 0.9732685297691374, + "County": 0.0004050222762251924, + "Kent County": 0.0004050222762251924, + "Kent, England": 0.005265289590927502, + "modern county": 0.0004050222762251924, + "the County of Kent": 0.0004050222762251924, + "County of Kent": 0.014985824220332119, + "Garden of England": 0.0004050222762251924, + "West Kent": 0.0004050222762251924, + "county of Kent": 0.0008100445524503848, + "Kent County, England": 0.0008100445524503848, + "County Kent": 0.0004050222762251924, + "Kent (England)": 0.0004050222762251924, + "County Of Kent": 0.0004050222762251924, + "Kent, England.": 0.0004050222762251924 + }, + "Q1431914": { + "London": 0.011904761904761904, + "Croydon": 0.16666666666666666, + "Croydon Airport": 0.6785714285714285, + "Airport House": 0.011904761904761904, + "Croydon airfield": 0.011904761904761904, + "Croydon Airfield": 0.011904761904761904, + "Croydon airport": 0.023809523809523808, + "original London Airport at Croydon": 0.011904761904761904, + "Croydon Aerodrome": 0.03571428571428571, + "London's international airport": 0.011904761904761904, + "RAF Croydon": 0.023809523809523808 + }, + "Q835031": { + "London": 0.0625, + "German Embassy": 0.0625, + "German Ambassador": 0.1875, + "German Embassy in London": 0.125, + "German embassy in London": 0.0625, + "German ambassador to the United Kingdom": 0.0625, + "Embassy of Germany": 0.0625, + "German Ambassador to the United Kingdom": 0.0625, + "London embassy": 0.0625, + "Embassy of the Federal Republic of Germany": 0.0625, + "German ambassador to Britain": 0.0625, + "Embassy of Germany to the U.K.": 0.0625, + "German ambassador": 0.0625 + }, + "Q1323689": { + "London": 0.016666666666666666, + "London International Film Festival": 0.041666666666666664, + "London Film Festival": 0.6666666666666666, + "BFI London Film 
Festival": 0.24166666666666667, + "British Film Institute Awards": 0.008333333333333333, + "London Film Fest": 0.008333333333333333, + "The Times BFI London Film Festival": 0.016666666666666666 + }, + "Q7594521": { + "London": 0.16666666666666666, + "St Mary's Roman Catholic Church": 0.3333333333333333, + "Our Immaculate Lady of Victories": 0.3333333333333333, + "Clapham parish": 0.16666666666666666 + }, + "Q26888": { + "London": 0.006756756756756757, + "Croydon": 0.2702702702702703, + "London Borough of Croydon": 0.6486486486486487, + "Croydon Council": 0.0472972972972973, + "London Borough": 0.006756756756756757, + "borough of Croydon": 0.006756756756756757, + "London Boroughs of Croydon": 0.006756756756756757, + "Borough of Croydon": 0.006756756756756757 + }, + "Q8709": { + "London": 0.007751937984496124, + "Stansted Airport": 0.3178294573643411, + "London Stansted": 0.13953488372093023, + "London Stansted Airport": 0.25193798449612403, + "Stansted": 0.2131782945736434, + "London-Stansted": 0.031007751937984496, + "proposed expansion": 0.003875968992248062, + "London\u2013Stansted": 0.023255813953488372, + "London's third airport": 0.003875968992248062, + "London Stansted Airport's": 0.003875968992248062, + "London-Stansted Airport": 0.003875968992248062 + }, + "Q1402606": { + "London": 0.0625, + "Neasden Temple": 0.375, + "BAPS Shri Swaminarayan Mandir London": 0.25, + "Swaminarayan Temple in Neasden": 0.0625, + "Neasden temple": 0.0625, + "Shri Swaminarayan Mandir": 0.0625, + "BAPS mandir in Neasden": 0.0625, + "Hindu Temple in Neasden": 0.0625 + }, + "Q278054": { + "London": 0.14285714285714285, + "Diocese of London": 0.2857142857142857, + "Roman Catholic Diocese of London": 0.14285714285714285, + "Bishop of London, Ontario": 0.42857142857142855 + }, + "Q801125": { + "London": 0.005291005291005291, + "London Bridge": 0.7777777777777777, + "London Bridge Station": 0.05291005291005291, + "London Bridge station": 0.12169312169312169, + "London Bridge railway 
station": 0.037037037037037035, + "redevelopment of London Bridge": 0.005291005291005291 + }, + "Q6669870": { + "London": 0.05555555555555555, + "London Book Fair": 0.8888888888888888, + "exclusion of dissident Chinese writers from the London Book Fair 2012": 0.05555555555555555 + }, + "Q1666958": { + "London": 0.03571428571428571, + "London International Surrealist Exhibition": 0.5357142857142857, + "International Surrealist Exhibition": 0.3214285714285714, + "1936 Surrealist Exhibition in London": 0.03571428571428571, + "International Exhibition of Surrealism": 0.03571428571428571, + "International Surrealist Exhibition in London": 0.03571428571428571 + }, + "Q5011830": { + "London": 0.25, + "CIQM-FM": 0.75 + }, + "Q772421": { + "London": 0.025, + "St George's": 0.05, + "St. George's University": 0.025, + "St George's Hospital": 0.025, + "St. George's Medical School": 0.025, + "St George's, University of London": 0.375, + "St George's Hospital Medical School": 0.275, + "St. George's Hospital": 0.025, + "St George's Medical School": 0.025, + "Grosvenor School of Medicine": 0.025, + "St George's (University of London)": 0.025, + "St George's Hospita": 0.025, + "St. George's Hospital Medical School": 0.05, + "St. 
George's University of London": 0.025 + }, + "Q1749569": { + "London": 0.16666666666666666, + "Ny-London": 0.8333333333333333 + }, + "Q60578265": { + "London": 0.2857142857142857, + "City": 0.14285714285714285, + "City of London": 0.5714285714285714 + }, + "Q897533": { + "Bramall Lane": 0.9253731343283582, + "Sheffield": 0.03731343283582089, + "John Street Stand": 0.007462686567164179, + "corner infill stand": 0.007462686567164179, + "Bramall Lane Stadium": 0.007462686567164179, + "Brammall Lane": 0.014925373134328358 + }, + "Q21061609": { + "Headingley": 0.8634146341463415, + "Leeds": 0.058536585365853655, + "cricket ground": 0.00975609756097561, + "Headingley, Leeds": 0.004878048780487805, + "Headingley Cricket Ground": 0.058536585365853655, + "home county ground": 0.004878048780487805 + }, + "Q1187032": { + "Headingley": 0.6170212765957447, + "Leeds": 0.06382978723404255, + "Headingley Stadium": 0.2978723404255319, + "Emerald Headingley Stadium": 0.02127659574468085 + }, + "Q39121": { + "Leeds": 0.9868680236375573, + "Leeds city centre": 0.0006565988181221273, + "Cross Gates": 0.0006565988181221273, + "Leeds, England": 0.004596191726854892, + "city of Leeds": 0.0013131976362442547, + "Lord Mayor of Leeds": 0.0013131976362442547, + "Leeds, Yorkshire": 0.0026263952724885093, + "Leeds, West Yorkshire": 0.0006565988181221273, + "Leeds, United Kingdom": 0.0006565988181221273, + "Leeds, UK": 0.0006565988181221273 + }, + "Q1128631": { + "Leeds": 0.020114942528735632, + "Leeds United": 0.9310344827586207, + "Leeds United A.F.C.": 0.012452107279693486, + "Leeds United F.C.": 0.01532567049808429, + "Leeds United Football Club": 0.008620689655172414, + "Leeds United FC": 0.008620689655172414, + "Leeds United's": 0.0028735632183908046, + "Leeds United's greatest team": 0.0009578544061302681 + }, + "Q774015": { + "Leeds": 0.20930232558139533, + "City of Leeds": 0.7209302325581395, + "Leeds City Council": 0.011627906976744186, + "Leeds City": 0.011627906976744186, + "City of 
Leeds Metropolitan District": 0.023255813953488372, + "City of Leeds Borough": 0.011627906976744186, + "Greater Leeds": 0.011627906976744186 + }, + "Q503424": { + "Leeds": 0.06382978723404255, + "University": 0.00425531914893617, + "University of Leeds": 0.7468085106382979, + "Institute of Education": 0.002127659574468085, + "Leeds University": 0.1595744680851064, + "Yorkshire College": 0.006382978723404256, + "Leeds University Business School": 0.002127659574468085, + "Leeds School of Architecture": 0.002127659574468085, + "Yorkshire College, Leeds": 0.00425531914893617, + "The University of Leeds": 0.002127659574468085, + "University campus": 0.002127659574468085, + "Yorkshire College of Science": 0.002127659574468085, + "Leeds Institute of Education": 0.002127659574468085 + }, + "Q1137962": { + "Leeds": 0.004310344827586207, + "Reading": 0.05818965517241379, + "Leeds Festival": 0.036637931034482756, + "Reading Festival": 0.41594827586206895, + "Reading and Leeds Festivals": 0.30387931034482757, + "Reading and Leeds Festival": 0.05603448275862069, + "Windsor Jazz and Blues Festival": 0.0021551724137931034, + "Reading and Leeds": 0.01939655172413793, + "Reading & Leeds": 0.0021551724137931034, + "Carling Weekend": 0.023706896551724137, + "Reading Rock Festival": 0.00646551724137931, + "Reading and Leeds festival": 0.01293103448275862, + "2010 Reading and Leeds Festivals": 0.0021551724137931034, + "Reading Music Festival": 0.010775862068965518, + "Reading festival": 0.004310344827586207, + "Carling Festival": 0.00646551724137931, + "Reading and Leeds festivals": 0.01939655172413793, + "The Reading and Leeds Festivals": 0.0021551724137931034, + "Reading and Leeds music festivals": 0.0021551724137931034, + "Reading/Leeds Festivals": 0.0021551724137931034, + "The Carling Weekend": 0.004310344827586207, + "Reading/Leeds": 0.0021551724137931034, + "Reading And Leeds": 0.0021551724137931034 + }, + "Q6515934": { + "Leeds": 0.375, + "Leeds City bus station": 0.625 + }, + 
"Q4834918": { + "Leeds": 0.06060606060606061, + "BBC Radio Leeds": 0.8484848484848485, + "BBC Leeds": 0.030303030303030304, + "Radio Leeds": 0.06060606060606061 + }, + "Q7721041": { + "Leeds": 1.0 + }, + "Q482468": { + "Leeds": 1.0 + }, + "Q2460124": { + "Leeds": 0.6666666666666666, + "Leeds, Kent": 0.3333333333333333 + }, + "Q79869": { + "Leeds": 0.5833333333333333, + "Leeds, Alabama": 0.41666666666666663 + }, + "Q746876": { + "Leeds": 0.08333333333333333, + "Leeds Castle": 0.9166666666666666 + }, + "Q6515805": { + "Leeds": 0.5, + "neighborhood of Leeds": 0.125, + "Leeds, Massachusetts": 0.375 + }, + "Q3461415": { + "Leeds": 1.0 + }, + "Q2365261": { + "Leeds": 1.0 + }, + "Q7746609": { + "Leeds": 0.05263157894736842, + "studios": 0.10526315789473684, + "The Leeds Studios": 0.47368421052631576, + "the Leeds Studios": 0.05263157894736842, + "Leeds Studios": 0.2631578947368421, + "Kirkstall Road studios": 0.05263157894736842 + }, + "Q14875251": { + "Leeds": 0.2, + "Leeds County": 0.7000000000000001, + "Leeds County, Ontario": 0.1 + }, + "Q6515927": { + "Leeds": 0.1111111111111111, + "Leeds City Region": 0.7777777777777777, + "Leeds City Region Partnership": 0.1111111111111111 + }, + "Q5177618": { + "Leeds": 0.14285714285714285, + "Leeds Corporation": 0.14285714285714285, + "County Borough of Leeds": 0.7142857142857142 + }, + "Q27985411": { + "Leeds": 1.0 + }, + "Q6515866": { + "Leeds": 0.3333333333333333, + "Leeds Central": 0.6666666666666666 + }, + "Q871138": { + "Leeds": 0.15384615384615385, + "Diocese of Leeds": 0.38461538461538464, + "Roman Catholic Diocese of Leeds": 0.46153846153846156 + }, + "Q4763489": { + "Leeds": 0.029411764705882353, + "Diocese of Leeds": 0.7647058823529411, + "Anglican Diocese of Leeds": 0.11764705882352941, + "of Leeds": 0.058823529411764705, + "Leeds diocese": 0.029411764705882353 + }, + "Q4871546": { + "Leeds": 1.0 + }, + "Q24896243": { + "Leeds": 0.25, + "Elland Road Greyhound Stadium": 0.75 + }, + "Q489255": { + "Leeds": 
0.007462686567164179, + "Sioux City": 0.4888059701492537, + "Sioux City, Iowa": 0.5, + "larger neighbor": 0.0037313432835820895 + }, + "Q3228965": { + "Leeds": 0.022727272727272728, + "Leeds Arts University": 0.20454545454545456, + "Leeds School of Art": 0.06818181818181818, + "Leeds College of Art": 0.4772727272727273, + "Jacob Kramer College": 0.11363636363636365, + "Leeds College of Art (Leeds Arts University)": 0.022727272727272728, + "Leeds Arts": 0.022727272727272728, + "Leeds College of Art and Design": 0.045454545454545456, + "Jacob Kramer College of Art": 0.022727272727272728 + }, + "Q209266": { + "Leeds": 0.6666666666666666, + "Leeds Junction": 0.08333333333333333, + "Leeds, Maine": 0.25 + }, + "Q42448": { + "Sheffield": 0.9401459854014598, + "Sheffield sound": 0.00072992700729927, + "City of Sheffield": 0.03941605839416058, + "home town": 0.00072992700729927, + "Sheffield, England": 0.012408759124087591, + "Sheffield, South Yorkshire": 0.00145985401459854, + "Sheffield, Yorkshire, England": 0.00072992700729927, + "City and County Borough of Sheffield": 0.00072992700729927, + "Sheffield, West Riding of Yorkshire": 0.00145985401459854, + "Sheffield, Yorkshire": 0.00072992700729927, + "Sheffield District": 0.00072992700729927, + "Sheffield's": 0.00072992700729927 + }, + "Q7492778": { + "Sheffield": 0.3448275862068966, + "Sheffield Victoria railway station": 0.13793103448275862, + "Sheffield Victoria": 0.41379310344827586, + "Sheffield Victoria station": 0.06896551724137931, + "Sheffield Victoria Station": 0.034482758620689655 + }, + "Q7492565": { + "Sheffield": 1.0 + }, + "Q1862179": { + "Sheffield": 0.7341772151898734, + "station": 0.006329113924050633, + "Pond Street": 0.006329113924050633, + "Sheffield station": 0.09493670886075949, + "Sheffield Station": 0.0189873417721519, + "Sheffield Railway Station": 0.006329113924050633, + "Sheffield Midland station": 0.0189873417721519, + "Sheffield railway station": 0.06962025316455696, + "Sheffield Midland 
Station": 0.012658227848101266, + "Sheffield Midland": 0.02531645569620253, + "Sheffield (Midland)": 0.006329113924050633 + }, + "Q823917": { + "Sheffield": 0.0851063829787234, + "University of Sheffield": 0.7234042553191489, + "The University of Sheffield": 0.014184397163120567, + "Sheffield University": 0.14893617021276595, + "the University of Sheffield": 0.0035460992907801418, + "universities of Sheffield": 0.0035460992907801418, + "Sheffield University's": 0.0035460992907801418, + "Tapton Hall of Residence": 0.0035460992907801418, + "University College of Sheffield": 0.0035460992907801418, + "Firth College": 0.0070921985815602835, + "Sheffield University.": 0.0035460992907801418 + }, + "Q4834926": { + "Sheffield": 0.08571428571428572, + "BBC Radio Sheffield": 0.8285714285714285, + "Radio Sheffield": 0.08571428571428572 + }, + "Q17643392": { + "Sheffield": 0.05263157894736842, + "Manor House": 0.05263157894736842, + "Sheffield Manor Lodge": 0.15789473684210525, + "Sheffield Manor": 0.5789473684210527, + "Manor Lodge": 0.15789473684210525 + }, + "Q2306176": { + "Sheffield": 0.5740740740740741, + "Sheffield, Massachusetts": 0.35185185185185186, + "Ashley Falls": 0.037037037037037035, + "Ashley Falls, Massachusetts": 0.037037037037037035 + }, + "Q7492570": { + "Sheffield": 1.0 + }, + "Q1950928": { + "Sheffield": 0.6470588235294118, + "Sheffield, Vermont": 0.29411764705882354, + "Sheffield Heights": 0.058823529411764705 + }, + "Q2277715": { + "Sheffield": 0.7857142857142857, + "Sheffield, Tasmania": 0.21428571428571427 + }, + "Q79568": { + "Sheffield": 0.48484848484848486, + "Sheffield, Alabama": 0.5151515151515151 + }, + "Q518864": { + "Sheffield": 0.7272727272727273, + "Sheffield, Illinois": 0.2727272727272727 + }, + "Q7492591": { + "Sheffield": 0.30000000000000004, + "Sheffield Blitz": 0.7000000000000001 + }, + "Q7492775": { + "Sheffield": 0.2, + "Sheffield Township": 0.8 + }, + "Q741640": { + "Sheffield": 0.25, + "Wheel of Sheffield": 0.75 + }, + "Q7492686": { 
+ "Sheffield": 0.125, + "Sheffield Interchange": 0.75, + "the bus interchange": 0.125 + }, + "Q3577611": { + "Sheffield": 0.1, + "Sheffield Lock": 0.5, + "Sheffield Bottom": 0.4 + }, + "Q12956644": { + "Sheffield": 0.34375, + "City of Sheffield": 0.59375, + "district borough": 0.03125, + "Sheffield Council": 0.03125 + }, + "Q547824": { + "Sheffield": 0.047619047619047616, + "HMS \"Sheffield\"": 0.38095238095238093, + "\"Sheffield\"": 0.047619047619047616, + "HMS Sheffield": 0.5238095238095237 + }, + "Q7492719": { + "Sheffield": 1.0 + }, + "Q7492566": { + "Sheffield": 1.0 + }, + "Q7492567": { + "Sheffield": 1.0 + }, + "Q4523493": { + "Sheffield": 0.125, + "Sheffield Built-up Area": 0.125, + "Sheffield built-up area": 0.25, + "Sheffield urban area": 0.5 + }, + "Q3028626": { + "Sheffield": 0.08333333333333333, + "Diocese of Sheffield": 0.75, + "diocese of Sheffield": 0.16666666666666666 + }, + "Q7492607": { + "Sheffield": 0.030303030303030304, + "city centre": 0.09090909090909091, + "quarters": 0.030303030303030304, + "City Centre": 0.06060606060606061, + "Sheffield City Centre": 0.42424242424242425, + "Sheffield city centre": 0.33333333333333337, + "Sheffield's city centre": 0.030303030303030304 + }, + "Q3365926": { + "Sheffield": 0.5714285714285714, + "Sheffield, New Brunswick": 0.42857142857142855 + }, + "Q7492568": { + "Sheffield": 1.0 + }, + "Q108940076": { + "Sheffield": 0.75, + "village of Sheffield": 0.25 + }, + "Q1184547": { + "Sheffield": 0.8181818181818182, + "Sheffield, PA": 0.18181818181818182 + }, + "Q1984238": { + "Sheffield": 0.6153846153846154, + "Sheffield, Ohio": 0.38461538461538464 + } +} \ No newline at end of file diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py index 238edb66..6b5570ad 100644 --- a/tests/test_data_processing.py +++ b/tests/test_data_processing.py @@ -6,11 +6,13 @@ from pathlib import Path import pandas as pd +import pytest -large_resources = "/resources/" # path to large resources -small_resources = 
"resources/" # path to small resources -processed_path_lwm = "experiments/outputs/data/lwm/" # path to processed LwM data -processed_path_hipe = "experiments/outputs/data/hipe/" # path to processed LwM data +current_dir = Path(__file__).parent.resolve() + +small_resources = os.path.join(current_dir,"sample_files/resources/") # path to small resources +processed_path_lwm = os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/") # path to processed LwM data +processed_path_hipe = os.path.join(current_dir,"sample_files/experiments/outputs/data/hipe/") # path to processed LwM data def test_publication_metadata_exists(): @@ -53,8 +55,8 @@ def test_original_lwm_data(): train_metadata = pd.read_csv(path_train_metadata, sep="\t") test_metadata = pd.read_csv(path_test_metadata, sep="\t") # Assert the size of the metadata files: - assert train_metadata.shape[0] == 343 - assert test_metadata.shape[0] == 112 + assert train_metadata.shape[0] == 1 + assert test_metadata.shape[0] == 1 assert train_metadata.shape[1] == 10 assert test_metadata.shape[1] == 10 # Items in metadata match number of files in directory, for test: @@ -98,8 +100,8 @@ def test_lwm_ner_conversion_fine(): dtype={"id": str}, ) # Assert size of the train and dev sets: - assert df_ner_train.shape == (5216, 3) - assert df_ner_dev.shape == (1304, 3) + assert df_ner_train.shape == (141, 3) + assert df_ner_dev.shape == (41, 3) # Assert number of sentences in train and dev (length of list and set should be the same): assert ( len(list(df_ner_train["id"]) + list(df_ner_dev["id"])) @@ -107,45 +109,11 @@ def test_lwm_ner_conversion_fine(): == df_ner_train.shape[0] + df_ner_dev.shape[0] ) # Assert ID is read as string: - assert type(df_ner_train["id"].iloc[0]) == str + assert isinstance(df_ner_train["id"].iloc[0],str) # Assert number of unique articles: train_articles = [x.split("_")[0] for x in list(df_ner_train["id"])] dev_articles = [x.split("_")[0] for x in list(df_ner_dev["id"])] - assert 
len(set(train_articles + dev_articles)) == 343 - - -def test_lwm_ner_conversion_coarse(): - """ - Test process_lwm_for_ner is not missing articles. - """ - df_ner_train = pd.read_json( - os.path.join(f"{processed_path_lwm}", "ner_coarse_train.json"), - orient="records", - lines=True, - dtype={"id": str}, - ) - df_ner_dev = pd.read_json( - os.path.join(f"{processed_path_lwm}", "ner_coarse_dev.json"), - orient="records", - lines=True, - dtype={"id": str}, - ) - # Assert size of the train and dev sets: - assert df_ner_train.shape == (5216, 3) - assert df_ner_dev.shape == (1304, 3) - # Assert number of sentences in train and dev (length of list and set should be the same): - assert ( - len(list(df_ner_train["id"]) + list(df_ner_dev["id"])) - == len(set(list(df_ner_train["id"]) + list(df_ner_dev["id"]))) - == df_ner_train.shape[0] + df_ner_dev.shape[0] - ) - # Assert ID is read as string: - assert type(df_ner_train["id"].iloc[0]) == str - # Assert number of unique articles: - train_articles = [x.split("_")[0] for x in list(df_ner_train["id"])] - dev_articles = [x.split("_")[0] for x in list(df_ner_dev["id"])] - assert len(set(train_articles + dev_articles)) == 343 - + assert len(set(train_articles + dev_articles)) == 11 def test_lwm_linking_conversion(): """ @@ -156,26 +124,26 @@ def test_lwm_linking_conversion(): sep="\t", ) # Assert size of the dataset (i.e. 
number of articles): - assert df_linking.shape[0] == 455 + assert df_linking.shape[0] == 14 # Assert if place has been filled correctly: for x in df_linking.place: - assert type(x) == str + assert isinstance(x,str) assert x != "" # Assert if place QID has been filled correctly: for x in df_linking.place_wqid: - assert type(x) == str + assert isinstance(x,str) assert x != "" for x in df_linking.annotations: x = literal_eval(x) for ann in x: assert ann["wkdt_qid"] == "NIL" or ann["wkdt_qid"].startswith("Q") - assert df_linking[df_linking["originalsplit"] == "train"].shape[0] == 229 - assert df_linking[df_linking["originalsplit"] == "dev"].shape[0] == 114 - assert df_linking[df_linking["originalsplit"] == "test"].shape[0] == 112 - assert df_linking[df_linking["withouttest"] == "train"].shape[0] == 153 - assert df_linking[df_linking["withouttest"] == "dev"].shape[0] == 76 - assert df_linking[df_linking["withouttest"] == "test"].shape[0] == 114 - assert df_linking[df_linking["withouttest"] == "left_out"].shape[0] == 112 + assert df_linking[df_linking["originalsplit"] == "train"].shape[0] == 10 + assert df_linking[df_linking["originalsplit"] == "dev"].shape[0] == 2 + assert df_linking[df_linking["originalsplit"] == "test"].shape[0] == 2 + assert df_linking[df_linking["withouttest"] == "train"].shape[0] == 8 + assert df_linking[df_linking["withouttest"] == "dev"].shape[0] == 2 + assert df_linking[df_linking["withouttest"] == "test"].shape[0] == 2 + assert df_linking[df_linking["withouttest"] == "left_out"].shape[0] == 2 test_withouttest = set( list(df_linking[df_linking["withouttest"] == "test"].article_id) ) @@ -185,7 +153,7 @@ def test_lwm_linking_conversion(): # Test articles of the original split and without test should not overlap: assert not (test_withouttest & test_originalsplit) - +@pytest.mark.skip(reason="Requires HIPE data") def test_hipe_linking_conversion(): """ Test process_hipe_for_linking is not missing articles. 
@@ -211,11 +179,11 @@ def test_hipe_linking_conversion(): assert not (test_withouttest & test_originalsplit) # Assert if place has been filled correctly: for x in df_linking.place: - assert type(x) == str + assert isinstance(x,str) assert x != "" # Assert if place QID has been filled correctly: for x in df_linking.place_wqid: - assert type(x) == str + assert isinstance(x,str) assert x != "" # Do HIPE stats match https://github.com/hipe-eval/HIPE-2022-data/blob/main/notebooks/hipe2022-datasets-stats.ipynb number_locs = 0 diff --git a/tests/test_deezy.py b/tests/test_deezy.py new file mode 100644 index 00000000..a72fa630 --- /dev/null +++ b/tests/test_deezy.py @@ -0,0 +1,44 @@ +import os +from pathlib import Path + +import pytest +from DeezyMatch import candidate_ranker + +current_dir = Path(__file__).parent.resolve() + +@pytest.mark.deezy(reason="Needs deezy model") +def test_deezy_match_deezy_candidate_ranker(tmp_path): + deezy_parameters = { + # Paths and filenames of DeezyMatch models and data: + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), + "dm_cands": "wkdtalts", + "dm_model": "w2v_ocr", + "dm_output": "deezymatch_on_the_fly", + # Ranking measures: + "ranking_metric": "faiss", + "selection_threshold": 50, + "num_candidates": 1, + "verbose": False, + # DeezyMatch training: + "overwrite_training": False, + "do_test": False, + } + + dm_path = deezy_parameters["dm_path"] + dm_cands = deezy_parameters["dm_cands"] + dm_model = deezy_parameters["dm_model"] + dm_output = deezy_parameters["dm_output"] + + query = ["-", "ST G", "• - , i", "- P", "• FERRIS"] + + candidates = candidate_ranker( + candidate_scenario=os.path.join(dm_path, "combined", dm_cands + "_" + dm_model), + query=query, + ranking_metric=deezy_parameters["ranking_metric"], + selection_threshold=deezy_parameters["selection_threshold"], + num_candidates=deezy_parameters["num_candidates"], + search_size=deezy_parameters["num_candidates"], + verbose=deezy_parameters["verbose"], 
+ output_path=os.path.join(tmp_path,dm_output), + ) + assert len(candidates) == len(query) diff --git a/tests/test_disambiguation.py b/tests/test_disambiguation.py index 1a206f2f..f9ab0372 100644 --- a/tests/test_disambiguation.py +++ b/tests/test_disambiguation.py @@ -3,22 +3,24 @@ import sys from pathlib import Path +import pytest import pandas as pd +import pytest -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from geoparser import linking, pipeline, ranking, recogniser -from utils import rel_utils -from utils.REL import entity_disambiguation +from t_res.geoparser import linking, pipeline, ranking, recogniser +from t_res.utils import rel_utils +from t_res.utils.REL import entity_disambiguation +current_dir = Path(__file__).parent.resolve() +@pytest.mark.skip(reason="Needs large db file") def test_embeddings(): """ Test embeddings are loaded correctly. """ # Test 1: Check glove embeddings mentions = ["in", "apple"] - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() embs = rel_utils.get_db_emb(cursor, mentions, "snd") assert len(mentions) == len(embs) @@ -43,29 +45,18 @@ def test_embeddings(): embs = rel_utils.get_db_emb(cursor, mentions, "entity") assert embs == [None] - -def test_prepare_initial_data(): - df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", sep="\t" - ).iloc[:1] - parsed_doc = rel_utils.prepare_initial_data(df) - assert parsed_doc["4939308_1"][0]["mention"] == "STALYBRIDGE" - assert parsed_doc["4939308_1"][0]["gold"][0] == "Q1398653" - assert parsed_doc["4939308_6"][1]["mention"] == "Market-street" - assert parsed_doc["4939308_6"][1]["gold"] == "NIL" - - -def test_train(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_train(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name 
prefix (will have suffixes appended) + model="ner_test", # NER model name prefix (will have suffixes appended) pipe=None, # We'll store the NER pipeline here base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + model_path=str(tmp_path), # Path where the NER model is or will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -76,7 +67,7 @@ def test_train(): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -85,13 +76,13 @@ def test_train(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -105,16 +96,16 @@ def test_train(): "do_test": False, }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: - cursor = conn.cursor() + with 
sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: + cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": False, @@ -154,18 +145,18 @@ def test_train(): # assert expected performance on test set assert mylinker.rel_params["ed_model"].best_performance["f1"] == 0.6288416075650118 - -def test_load_eval_model(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_load_eval_model(tmp_path): myner = recogniser.Recogniser( model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) pipe=None, # We'll store the NER pipeline here base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + model_path=str(tmp_path), # Path where the NER model is or will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -176,7 +167,7 @@ def test_load_eval_model(): myranker = 
ranking.Ranker( method="deezymatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -185,13 +176,13 @@ def test_load_eval_model(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), + "w2v_ocr_path": str(tmp_path), "w2v_ocr_model": "w2v_*_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -206,18 +197,16 @@ def test_load_eval_model(): }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() - mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm"), "training_split": "originalsplit", - "topn_candidates": 10, "db_embeddings": cursor, "with_publication": False, "without_microtoponyms": False, @@ -253,19 +242,19 @@ def test_load_eval_model(): == entity_disambiguation.EntityDisambiguation ) - -def test_predict(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_predict(tmp_path): myner = recogniser.Recogniser( model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) pipe=None, # We'll store the NER 
pipeline here base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + model_path=str(tmp_path), # Path where the NER model is or will be stored training_args={ "learning_rate": 5e-5, "batch_size": 16, - "num_train_epochs": 4, + "num_train_epochs": 1, "weight_decay": 0.01, }, overwrite_training=False, # Set to True if you want to overwrite model if existing @@ -275,7 +264,7 @@ def test_predict(): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -284,13 +273,13 @@ def test_predict(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -304,23 +293,21 @@ def test_predict(): "do_test": False, }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: - cursor = conn.cursor() + with 
sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: + cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": True, "without_microtoponyms": True, "do_test": False, - "default_publname": "United Kingdom", - "default_publwqid": "Q145", }, overwrite_training=False, ) @@ -332,7 +319,7 @@ def test_predict(): place="London", place_wqid="Q84", ) - assert type(predictions) == list + assert isinstance(predictions,list) assert predictions[1]["prediction"] in predictions[1]["cross_cand_score"] diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 825e29db..87fc6fd2 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -1,23 +1,25 @@ import os import sys from ast import literal_eval +from pathlib import Path import pandas as pd import pytest -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) +# Add "../" to path to import experiment +current_dir = Path(__file__).parent.resolve() +sys.path.insert(0, os.path.join(current_dir,"../")) from experiments import experiment -from geoparser import linking, ranking, recogniser +from t_res.geoparser import linking, ranking, recogniser -def test_wrong_dataset_path(): +def test_experiments_wrong_dataset_path(tmp_path): with pytest.raises(SystemExit) as cm: experiment.Experiment( dataset="lwm", data_path="wrong_path/", dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + 
results_path=str(tmp_path), myner="test", myranker="test", mylinker="test", @@ -30,8 +32,8 @@ def test_wrong_dataset_path(): ) -def test_load_data(): - data = pd.read_csv("experiments/outputs/data/lwm/linking_df_split.tsv", sep="\t") +def test_load_data(tmp_path): + data = pd.read_csv(os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t") ids = set() for idx, row in data.iterrows(): @@ -41,29 +43,34 @@ def test_load_data(): ids.add(str(article_id) + "_" + str(sent["sentence_pos"])) myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args=dict(), + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, + base_model="khosseini/bert_1760_1900", # Base model to fine-tune + model_path=str(tmp_path), # Path where the NER model will be stored + training_args={ + "batch_size": 8, + "num_train_epochs": 1, + "learning_rate": 0.00005, + "weight_decay": 0.0, + }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode - load_from_hub=False, + load_from_hub=False, ) # Instantiate the ranker: myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) # 
-------------------------------------- # Instantiate the linker: mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) myner.train() @@ -78,9 +85,9 @@ def test_load_data(): # Instantiate the experiment: exp = experiment.Experiment( dataset="lwm", - data_path="experiments/outputs/data/", + data_path=os.path.join(current_dir,"sample_files/experiments/outputs/data/"), dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + results_path=str(tmp_path), myner=myner, myranker=myranker, mylinker=mylinker, @@ -121,18 +128,18 @@ def test_load_data(): assert len(not_empty_dMentionsPred) == len(not_empty_dCandidates) -def test_wrong_ranker_method(): +def test_wrong_ranker_method(tmp_path): ranker = ranking.Ranker( # wrong naming: it should be perfectmatch method="perfect_match", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) exp = experiment.Experiment( dataset="lwm", - data_path="experiments/outputs/data/", + data_path=os.path.join(current_dir,"sample_files/experiments/outputs/data/"), dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + results_path=str(tmp_path), myner="test", myranker=ranker, mylinker="test", @@ -142,31 +149,36 @@ def test_wrong_ranker_method(): assert cm.value.code == 0 -def test_apply(): +def test_apply(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - pipe=None, # We'll store the NER pipeline here - base_model="khosseini/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be 
stored - training_args=dict(), + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, + base_model="khosseini/bert_1760_1900", # Base model to fine-tune + model_path=str(tmp_path), # Path where the NER model will be stored + training_args={ + "batch_size": 8, + "num_train_epochs": 1, + "learning_rate": 0.00005, + "weight_decay": 0.0, + }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode - load_from_hub=False, + load_from_hub=False, ) # Instantiate the ranker: myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) # -------------------------------------- # Instantiate the linker: mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) myner.train() @@ -181,9 +193,9 @@ def test_apply(): # Instantiate the experiment: exp = experiment.Experiment( dataset="lwm", - data_path="experiments/outputs/data/", + data_path=os.path.join(current_dir,"sample_files/experiments/outputs/data/"), dataset_df=pd.DataFrame(), - results_path="experiments/outputs/results/", + results_path=str(tmp_path), myner=myner, myranker=myranker, mylinker=mylinker, diff --git a/tests/test_linking.py b/tests/test_linking.py index c233ecc8..30587416 100644 --- a/tests/test_linking.py +++ b/tests/test_linking.py @@ -1,33 +1,19 @@ import os import sqlite3 import sys +from pathlib import Path import numpy as np +import pytest -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from geoparser import linking +from t_res.geoparser import linking +current_dir = 
Path(__file__).parent.resolve() -def test_initialise_method(): - """ - Test initialisation works fine - """ +def test_linking_most_popular(): mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", - linking_resources=dict(), - rel_params=dict(), - overwrite_training=False, - ) - - assert type(mylinker.__str__()) == str - - -def test_most_popular(): - mylinker = linking.Linker( - method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params=dict(), overwrite_training=False, @@ -39,8 +25,8 @@ def test_most_popular(): } keep_most_popular, final_score, candidates = mylinker.most_popular(dict_mention) assert keep_most_popular == "Q84" - assert final_score == 0.9895689976719958 - assert candidates == {"Q84": 0.9895689976719958, "Q92561": 0.01043100232800422} + assert final_score == 0.9812731647051174 + assert candidates == {"Q84": 0.9812731647051174, "Q92561": 0.018726835294882633} dict_mention = {"candidates": {}} keep_most_popular, final_score, candidates = mylinker.most_popular(dict_mention) @@ -52,7 +38,7 @@ def test_most_popular(): def test_by_distance(): mylinker = linking.Linker( method="bydistance", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params=dict(), overwrite_training=False, @@ -60,6 +46,7 @@ def test_by_distance(): mylinker.load_resources() + #test it finds London, UK dict_mention = { "candidates": { "London": {"Candidates": {"Q84": 0.9, "Q92561": 0.1}, "Score": 0.397048} @@ -71,20 +58,22 @@ def test_by_distance(): assert final_score == 0.824 assert "Q84" in resulting_cands + #test it finds London, CA dict_mention = { "candidates": { "London": {"Candidates": {"Q84": 0.9, "Q92561": 0.1}, "Score": 0.397048} }, - "place_wqid": "Q172", + "place_wqid": "Q92561", } pred, final_score, resulting_cands = mylinker.by_distance(dict_mention) assert pred == 
"Q92561" - assert final_score == 0.54 + assert final_score == 0.624 assert "Q84" in resulting_cands + #check it finds none dict_mention = { "candidates": {"London": {"Candidates": {}, "Score": 0.397048}}, - "place_wqid": "Q172", + "place_wqid": "Q2365261", } pred, final_score, resulting_cands = mylinker.by_distance(dict_mention) assert pred == "NIL" diff --git a/tests/test_ner.py b/tests/test_ner.py index b8a9718c..746e636a 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -1,190 +1,129 @@ import os -import shutil -import sys +from pathlib import Path +import pytest -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) -import transformers +from transformers.pipelines.token_classification import TokenClassificationPipeline -from geoparser import recogniser -from utils import ner +from t_res.geoparser import recogniser +from t_res.utils import ner +current_dir = Path(__file__).parent.resolve() -def test_training(): - """ - Test that running train() generates a model folder - """ - - test_folder_path = "resources/models/blb_lwm-ner-coarse_test.model" - - if os.path.isdir(test_folder_path): - shutil.rmtree(test_folder_path) - +def test_ner_local_train(tmp_path): + model_path = os.path.join(tmp_path,"ner_test.model") + myner = recogniser.Recogniser( - model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_coarse_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_coarse_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model="ner_test", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + 
test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + base_model="Livingwithmachines/bert_1760_1900", + model_path=f"{tmp_path}/", training_args={ "batch_size": 8, "num_train_epochs": 10, "learning_rate": 0.00005, "weight_decay": 0.0, }, - overwrite_training=True, # Set to True if you want to overwrite model if existing - do_test=True, # Set to True if you want to train on test mode + overwrite_training=False, + do_test=False, load_from_hub=False, ) - assert os.path.isdir(test_folder_path) == False + assert os.path.exists(model_path) is False myner.train() - assert os.path.isdir(test_folder_path) == True - - -def test_create_pipeline(): - """ - Test that create_pipeline returns a model folder path that exists and an Pipeline object - """ - myner = recogniser.Recogniser( - model="blb_lwm-ner-coarse", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args={ - "batch_size": 8, - "num_train_epochs": 10, - "learning_rate": 0.00005, - "weight_decay": 0.0, - }, - overwrite_training=False, # Set to True if you want to overwrite model if existing - do_test=True, # Set to True if you want to train on test mode - load_from_hub=False, - ) - pipe = myner.create_pipeline() - assert ( - type(pipe) - == transformers.pipelines.token_classification.TokenClassificationPipeline - ) - + print(model_path) + print(os.listdir(tmp_path)) + assert os.path.exists(model_path) is True +@pytest.mark.skip(reason="Needs large model file") def test_ner_predict(): + model_path = 
os.path.join(current_dir,"sample_files/resources/models/ner_test.model") + assert os.path.isdir(model_path) is True + myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model="ner_test", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + base_model="Livingwithmachines/bert_1760_1900", + model_path=os.path.join(current_dir,"sample_files/resources/models/"), training_args={ "batch_size": 8, "num_train_epochs": 10, "learning_rate": 0.00005, "weight_decay": 0.0, }, - overwrite_training=False, # Set to True if you want to overwrite model if existing - do_test=False, # Set to True if you want to train on test mode - load_from_hub=False, + overwrite_training=False, + do_test=False, + load_from_hub=False, # Whether the final model should be loaded from the HuggingFace hub" ) myner.pipe = myner.create_pipeline() + assert isinstance(myner.pipe, TokenClassificationPipeline) - preds = myner.ner_predict( - "I grew up in Bologna, a city near Florence, but way more interesting." - ) - assert type(preds) == list - assert (type(preds[0])) == dict - assert len(preds) == 16 - assert preds[4]["entity"] == "B-LOC" - assert preds[4]["score"] == 0.9994915723800659 + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." 
+ predictions = myner.ner_predict(sentence) + assert isinstance(predictions, list) + assert len(predictions) == 15 + assert predictions[13] == {'entity': 'B-LOC', 'score': 0.7941257357597351, 'word': 'Sheffield', 'start': 74, 'end': 83} # Test that ner_predict() can handle hyphens - preds = myner.ner_predict("- I grew up in Plymouth—Kingston.") - assert preds[0]["word"] == "-" - assert preds[6]["word"] == "," + sentence = "- I grew up in Plymouth—Kingston." + predictions = myner.ner_predict(sentence) + assert predictions[0]["word"] == "-" + assert predictions[6]["word"] == "," -def test_ner_load_from_hub(): +def test_ner_from_hub(): myner = recogniser.Recogniser( model="Livingwithmachines/toponym-19thC-en", load_from_hub=True, ) - pipe = myner.create_pipeline() - assert ( - type(pipe) - == transformers.pipelines.token_classification.TokenClassificationPipeline - ) + myner.train() + myner.pipe = myner.create_pipeline() + assert isinstance(myner.pipe, TokenClassificationPipeline) + + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." 
+ predictions = myner.ner_predict(sentence) + assert isinstance(predictions, list) + assert len(predictions) == 15 + assert predictions[13] == {'entity': 'B-LOC', 'score': 0.9996446371078491, 'word': 'Sheffield', 'start': 74, 'end': 83} def test_aggregate_mentions(): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # NER model name prefix (will have suffixes appended) - base_model="Livingwithmachines/bert_1760_1900", # Base model to fine-tune (from huggingface) - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored - training_args={ - "batch_size": 8, - "num_train_epochs": 10, - "learning_rate": 0.00005, - "weight_decay": 0.0, - }, - overwrite_training=False, # Set to True if you want to overwrite model if existing - do_test=False, # Set to True if you want to train on test mode - load_from_hub=False, + model="Livingwithmachines/toponym-19thC-en", + load_from_hub=True, ) myner.pipe = myner.create_pipeline() - + sentence = "I grew up in Bologna, a city near Florence, but way more interesting." 
predictions = myner.ner_predict(sentence) # Process predictions: procpreds = [ - [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]] + [x["word"], x["entity"], "O", x["start"], x["end"]] for x in predictions ] # Aggregate mentions: mentions = ner.aggregate_mentions(procpreds, "pred") - assert mentions[0]["mention"] == "Bologna" + assert len(mentions) == 2 assert mentions[1]["mention"] == "Florence" + assert mentions[0] == {'mention': 'Bologna', 'start_offset': 4, 'end_offset': 4, 'start_char': 13, 'end_char': 20, 'ner_score': 20.0, 'ner_label': 'LOC', 'entity_link': 'O'} assert mentions[0]["end_char"] - mentions[0]["start_char"] == len( mentions[0]["mention"] ) - assert mentions[1]["end_char"] - mentions[1]["start_char"] == len( - mentions[1]["mention"] - ) assert mentions[0]["mention"] in sentence - assert mentions[1]["mention"] in sentence - - sentence = "I grew up in New York City, a city in the United States." - predictions = myner.ner_predict(sentence) - # Process predictions: - procpreds = [ - [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]] - for x in predictions - ] - # Aggregate mentions: - mentions = ner.aggregate_mentions(procpreds, "pred") - assert mentions[0]["mention"] == "New York City" - assert mentions[1]["mention"] == "United States" - assert mentions[0]["end_char"] - mentions[0]["start_char"] == len( - mentions[0]["mention"] - ) - assert mentions[1]["end_char"] - mentions[1]["start_char"] == len( - mentions[1]["mention"] - ) - assert mentions[0]["mention"] in sentence - assert mentions[1]["mention"] in sentence sentence = "ARMITAGE, DEM’TIST, may be consulted dally, from 9 a.m., till 8 p.m., at his residence, 95, STAMFORP-9TKEET, Ashton-cnder-Ltne." 
predictions = myner.ner_predict(sentence) # Process predictions: procpreds = [ - [x["word"], x["entity"], "O", x["start"], x["end"], x["score"]] + [x["word"], x["entity"], "O", x["start"], x["end"]] for x in predictions ] # Aggregate mentions: mentions = ner.aggregate_mentions(procpreds, "pred") - assert mentions[-1]["mention"] == "Ashton-cnder-Ltne" - for i in range(len(mentions)): - assert mentions[i]["end_char"] - mentions[i]["start_char"] == len( - mentions[i]["mention"] + assert len(mentions) == 2 + assert mentions[1]["mention"] == "Ashton-cnder-Ltne" + assert mentions[0] == {'mention': 'STAMFORP-9TKEET', 'start_offset': 31, 'end_offset': 33, 'start_char': 92, 'end_char': 107, 'ner_score': 102.667, 'ner_label': 'STREET', 'entity_link': 'O'} + assert mentions[0]["end_char"] - mentions[0]["start_char"] == len( + mentions[0]["mention"] ) - assert mentions[i]["mention"] in sentence + assert mentions[0]["mention"] in sentence diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 19643665..274d7267 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,23 +1,57 @@ import os import sqlite3 -import sys from pathlib import Path -sys.path.insert(0, os.path.abspath(os.path.pardir)) -from geoparser import linking, pipeline, ranking, recogniser +import pytest +from t_res.geoparser import linking, pipeline, ranking, recogniser -def test_deezy_mostpopular(): +current_dir = Path(__file__).parent.resolve() + +def test_pipeline_basic(): + geoparser = pipeline.Pipeline( + resources_path=os.path.join(current_dir,"sample_files/resources") + ) + + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." 
+ resolved = geoparser.run_text(sentence) + assert len(resolved)==1 + assert resolved[0]["mention"]=="Sheffield" + assert resolved[0]["ner_score"]==1.0 + assert resolved[0]["prediction"]=="Q42448" + +def test_pipeline_modular(): + myranker = ranking.Ranker( + method="perfectmatch", + resources_path=os.path.join(current_dir,"sample_files/resources"), + ) + + mylinker = linking.Linker( + method="mostpopular", + resources_path=os.path.join(current_dir,"sample_files/resources/"), + ) + + geoparser = pipeline.Pipeline(myranker=myranker, mylinker=mylinker) + + sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield." + resolved = geoparser.run_text(sentence) + assert len(resolved)==1 + assert resolved[0]["mention"]=="Sheffield" + assert resolved[0]["ner_score"]==1.0 + assert resolved[0]["prediction"]=="Q42448" + +@pytest.mark.deezy(reason="Needs deezy model") +def test_deezy_mostpopular(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -28,7 +62,7 @@ def test_deezy_mostpopular(): myranker = ranking.Ranker( 
method="deezymatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -37,13 +71,13 @@ def test_deezy_mostpopular(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": os.path.join(tmp_path,"resources/models/"), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -60,14 +94,16 @@ def test_deezy_mostpopular(): mylinker = linking.Linker( method="mostpopular", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker) + assert len(geoparser.myranker.mentions_to_wikidata.keys())>0 resolved = geoparser.run_text( - "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!", + "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. 
Not in London though.", ) + assert len(resolved) == 3 assert resolved[0]["mention"] == "Shefiield" assert resolved[0]["prior_cand_score"] == dict() assert resolved[0]["cross_cand_score"]["Q42448"] == 0.903 @@ -84,20 +120,19 @@ def test_deezy_mostpopular(): # asserting behaviour with • character resolved = geoparser.run_text( - " • - ST G pOllO-P• FERRIS - • - , i ", + " • - S G pOllO-P• FERRIS - • - , i ", ) - assert resolved == [] - -def test_deezy_rel_wpubl_wmtops(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_deezy_rel_wpubl_wmtops(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, "num_train_epochs": 10, @@ -113,7 +148,7 @@ def test_deezy_rel_wpubl_wmtops(): # Instantiate the ranker: myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -122,13 +157,13 @@ def test_deezy_rel_wpubl_wmtops(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": 
str(tmp_path), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -143,15 +178,15 @@ def test_deezy_rel_wpubl_wmtops(): }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": True, @@ -166,11 +201,12 @@ def test_deezy_rel_wpubl_wmtops(): geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker) resolved = geoparser.run_text( - "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!", + "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. 
Not in London though.", place="Sheffield", place_wqid="Q42448", ) + assert len(resolved) == 3 assert resolved[0]["mention"] == "Shefiield" assert resolved[0]["prior_cand_score"]["Q42448"] == 0.891 assert resolved[0]["cross_cand_score"]["Q42448"] == 0.576 @@ -178,18 +214,18 @@ def test_deezy_rel_wpubl_wmtops(): assert resolved[0]["ed_score"] == 0.039 assert resolved[0]["ner_score"] == 1.0 - -def test_perfect_rel_wpubl_wmtops(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_perfect_rel_wpubl_wmtops(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -202,7 +238,7 @@ def test_perfect_rel_wpubl_wmtops(): # Instantiate the ranker: myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -211,13 +247,13 @@ def test_perfect_rel_wpubl_wmtops(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("resources/models/w2v/").resolve()), - 
"w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -232,20 +268,20 @@ def test_perfect_rel_wpubl_wmtops(): }, ) - with sqlite3.connect("resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "resources/models/disambiguation/", - "data_path": "experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"), "training_split": "originalsplit", "db_embeddings": cursor, "with_publication": True, "without_microtoponyms": True, - "do_test": True, + "do_test": False, "default_publname": "United Kingdom", "default_publwqid": "Q145", }, @@ -255,7 +291,7 @@ def test_perfect_rel_wpubl_wmtops(): geoparser = pipeline.Pipeline(myner=myner, myranker=myranker, mylinker=mylinker) resolved = geoparser.run_text( - "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!", + "A remarkable case of rattening has just occurred in the building trade at Shefiield, but also in Leeds. 
Not in London though.", place="Sheffield", place_wqid="Q42448", ) @@ -267,18 +303,18 @@ def test_perfect_rel_wpubl_wmtops(): assert resolved[0]["ed_score"] == 0.0 assert resolved[0]["ner_score"] == 1.0 - -def test_modular_deezy_rel(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_modular_deezy_rel(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "batch_size": 8, - "num_train_epochs": 10, + "num_train_epochs": 1, "learning_rate": 0.00005, "weight_decay": 0.0, }, @@ -287,22 +323,26 @@ def test_modular_deezy_rel(): load_from_hub=False, # Bool: True if model is in HuggingFace hub ) + # -------------------------------------- + # Instantiate the ranker: myranker = ranking.Ranker( method="deezymatch", - resources_path="./resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), + mentions_to_wikidata=dict(), + wikidata_to_mentions=dict(), strvar_parameters={ # Parameters to create the string pair dataset: "ocr_threshold": 60, "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": str(Path("./resources/models/w2v/").resolve()), - "w2v_ocr_model": "w2v_*_news", + "w2v_ocr_path": str(tmp_path), + "w2v_ocr_model": 
"w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("./resources/deezymatch/").resolve()), + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -317,15 +357,15 @@ def test_modular_deezy_rel(): }, ) - with sqlite3.connect("./resources/rel_db/embeddings_database.db") as conn: + with sqlite3.connect(os.path.join(current_dir,"sample_files/resources/rel_db/embeddings_database.db")) as conn: cursor = conn.cursor() mylinker = linking.Linker( method="reldisamb", - resources_path="./resources/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), linking_resources=dict(), rel_params={ - "model_path": "./resources/models/disambiguation/", - "data_path": "./experiments/outputs/data/lwm/", + "model_path": os.path.join(current_dir,"sample_files/resources/models/disambiguation/"), + "data_path": os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/"), "training_split": "apply", "db_embeddings": cursor, "with_publication": True, diff --git a/tests/test_process_data.py b/tests/test_process_data.py index 671c85ca..9d5fb683 100644 --- a/tests/test_process_data.py +++ b/tests/test_process_data.py @@ -1,15 +1,14 @@ import os import sys +from pathlib import Path import pandas as pd import pytest -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) - -from geoparser import recogniser -from utils import process_data +from t_res.geoparser import recogniser +from t_res.utils import process_data +current_dir = Path(__file__).parent.resolve() def test_eval_with_exception(): # test normal behaviour @@ -18,10 +17,8 @@ def test_eval_with_exception(): list_of_dict = process_data.eval_with_exception(str_list_of_dict) assert list_of_dict != str_list_of_dict - - assert type(list_of_dict) == list - - assert type(list_of_dict[0]) == 
dict + assert isinstance(list_of_dict,list) + assert isinstance(list_of_dict[0],dict) # test that it returns "" if the input is None @@ -32,17 +29,14 @@ def test_eval_with_exception(): # test that it raises an error if the syntax is wrong str_list_of_dict = "[{'key_1': 1, 'key_2': 2}" - check = False + with pytest.raises(SyntaxError) as cm: - check = True process_data.eval_with_exception(str_list_of_dict) - assert check == True - def test_prepare_sents(): dataset_df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", + os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t", ) @@ -53,7 +47,7 @@ def test_prepare_sents(): dAnnotated, dSentences, dMetadata = process_data.prepare_sents(dataset_df) - assert dAnnotated["4428937_4"][(26, 41)] == ("LOC", "Bt. Jamess Park", "Q216914") + assert dAnnotated["3580760_2"][(0, 6)] == ('LOC', 'LONDON', 'Q84') test_data = process_data.eval_with_exception(dataset_df["annotations"][0]) test_data[0]["wkdt_qid"] = "*" @@ -62,7 +56,7 @@ def test_prepare_sents(): dAnnotated, dSentences, dMetadata = process_data.prepare_sents(dataset_df) - assert dAnnotated["4428937_4"][(26, 41)] == ("LOC", "Bt. 
Jamess Park", "Q216914") + assert dAnnotated["3580760_2"][(0, 6)] == ('LOC', 'LONDON', 'Q84') assert len(dAnnotated) == len(dSentences) == len(dMetadata) @@ -70,35 +64,37 @@ def test_prepare_sents(): assert len([x for x, y in dMetadata.items() if len(y) == 0]) == 0 -def test_align_gold(): +def test_align_gold(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "learning_rate": 5e-5, "batch_size": 16, - "num_train_epochs": 4, + "num_train_epochs": 1, "weight_decay": 0.01, }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode + load_from_hub=False, # Bool: True if model is in HuggingFace hub ) + myner.train() myner.pipe = myner.create_pipeline() dataset_df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", + os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t", ) dAnnotated, dSentences, dMetadata = process_data.prepare_sents(dataset_df) empty_list = [] for sent_id in dSentences.keys(): - if "4935585_1" == sent_id: + if "3580760_2" == sent_id: sent = dSentences[sent_id] annotations = dAnnotated[sent_id] predictions = 
myner.ner_predict(sent) @@ -125,27 +121,30 @@ def test_align_gold(): assert len(empty_list) == 0 -def test_ner_and_process(): +def test_ner_and_process(tmp_path): myner = recogniser.Recogniser( - model="blb_lwm-ner-fine", # We'll store the NER model here - pipe=None, # We'll store the NER pipeline here + model="blb_lwm-ner-fine", + train_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_train.json"), + test_dataset=os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/ner_fine_dev.json"), + pipe=None, base_model="khosseini/bert_1760_1900", # Base model to fine-tune - train_dataset="experiments/outputs/data/lwm/ner_fine_train.json", # Training set (part of overall training set) - test_dataset="experiments/outputs/data/lwm/ner_fine_dev.json", # Test set (part of overall training set) - model_path="resources/models/", # Path where the NER model is or will be stored + model_path=str(tmp_path), # Path where the NER model will be stored training_args={ "learning_rate": 5e-5, "batch_size": 16, - "num_train_epochs": 4, + "num_train_epochs": 1, "weight_decay": 0.01, }, overwrite_training=False, # Set to True if you want to overwrite model if existing do_test=False, # Set to True if you want to train on test mode + load_from_hub=False, # Bool: True if model is in HuggingFace hub ) + + myner.train() myner.pipe = myner.create_pipeline() dataset_df = pd.read_csv( - "experiments/outputs/data/lwm/linking_df_split.tsv", + os.path.join(current_dir,"sample_files/experiments/outputs/data/lwm/linking_df_split.tsv"), sep="\t", ) diff --git a/tests/test_ranking.py b/tests/test_ranking.py index 5f3dae5e..9becf11c 100644 --- a/tests/test_ranking.py +++ b/tests/test_ranking.py @@ -1,39 +1,23 @@ -import json import os -import sys from pathlib import Path import pytest -from DeezyMatch import candidate_ranker -# Add "../" to path to import utils -sys.path.insert(0, os.path.abspath(os.path.pardir)) +from t_res.geoparser import ranking -from 
geoparser import ranking +current_dir = Path(__file__).parent.resolve() - -def test_initialise_method(): - """ - Test initialisation works fine - """ - myranker = ranking.Ranker( - method="perfectmatch", - resources_path="resources/wikidata/", - mentions_to_wikidata=dict(), - wikidata_to_mentions=dict(), - ) - assert type(myranker.__str__()) == str - - -def test_perfect_match(): +def test_ranking_perfect_match(): """ Test that perfect_match returns only perfect matching cases """ myranker = ranking.Ranker( method="perfectmatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) + myranker.mentions_to_wikidata = myranker.load_resources() + myranker.already_collected_cands = {} candidates, already_collected_cands = myranker.perfect_match(["London"]) assert candidates["London"]["London"] == 1.0 @@ -44,107 +28,92 @@ def test_perfect_match(): assert candidates["Paperopoli"] == {} -def test_damlev(): +def test_ranking_damlev(): """ Test that damlev returns correctly """ myranker = ranking.Ranker( method="partialmatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) + score = myranker.damlev_dist("Lvndon", {"mentions": "London"}) assert score == 0.8333333283662796 + score = myranker.damlev_dist("uityity", {"mentions": "asdasd"}) + assert score == 0.0 + with pytest.raises(TypeError): - found = True myranker.damlev_dist("Lvndon", "London") - assert found == True - assert 0.0 == myranker.damlev_dist("uityity", {"mentions": "asdasd"}) - -def test_check_if_contained(): +def test_ranking_check_if_contained(): """ Test that check_if_contained returns score only when there is an overlap """ myranker = ranking.Ranker( method="partialmatch", - resources_path="resources/wikidata/", - mentions_to_wikidata=dict(), - wikidata_to_mentions=dict(), + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) + score_a = myranker.check_if_contained("New 
York", {"mentions": "New York City"}) score_b = myranker.check_if_contained("New York City", {"mentions": "New York"}) - assert score_a == score_b == 0.6153846153846154 with pytest.raises(TypeError): - found = True myranker.check_if_contained("Lvndon", "London") - assert found == True - assert None == myranker.check_if_contained("London", {"mentions": "New York"}) + score = myranker.check_if_contained("London", {"mentions": "New York"}) + assert score is None -def test_partial_match(): +def test_ranking_partial_match(): """ Test that partial match either returns results or {} """ myranker = ranking.Ranker( method="partialmatch", - resources_path="resources/wikidata/", - mentions_to_wikidata=dict(), - wikidata_to_mentions=dict(), + resources_path=os.path.join(current_dir,"sample_files/resources/"), ) myranker.mentions_to_wikidata = myranker.load_resources() # Test that perfect_match acts before partial match myranker.mentions_to_wikidata = {"London": "Q84"} - candidates, already_collected_cands = myranker.partial_match( - ["London"], damlev=False - ) + myranker.already_collected_cands = {} + candidates, already_collected_cands = myranker.partial_match(["London"], damlev=False) assert candidates["London"]["London"] == 1.0 # Test that damlev works myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( - ["Lvndvn"], damlev=True - ) + candidates, already_collected_cands = myranker.partial_match(["Lvndvn"], damlev=True) assert candidates["Lvndvn"]["London"] == 0.6666666567325592 # Test that overlap works properly myranker.mentions_to_wikidata = {"New York City": "Q60"} myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( - ["New York"], damlev=False - ) + candidates, already_collected_cands = myranker.partial_match(["New York"], damlev=False) assert candidates["New York"]["New York City"] == 0.6153846153846154 - myranker.mentions_to_wikidata = {"New York City": "Q60"} 
myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( - ["Lvndvn"], damlev=False - ) + candidates, already_collected_cands = myranker.partial_match(["Lvndvn"], damlev=False) assert candidates["Lvndvn"] == {} myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.partial_match( - ["asdasd"], damlev=True - ) + candidates, already_collected_cands = myranker.partial_match(["asdasd"], damlev=True) assert candidates["asdasd"] == {"New York City": 0.0} - -def test_deezy_on_the_fly(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_ranking_deezy_on_the_fly(tmp_path): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -153,13 +122,13 @@ def test_deezy_on_the_fly(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": "resources/models/", + "w2v_ocr_path": str(tmp_path), "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": "resources/deezymatch/", + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -171,7 +140,7 @@ def test_deezy_on_the_fly(): "verbose": False, # DeezyMatch training: "overwrite_training": False, - "do_test": True, + "do_test": False, }, ) @@ -182,20 +151,14 @@ def test_deezy_on_the_fly(): # Test that deezy works myranker.already_collected_cands = {} + candidates, already_collected_cands = myranker.deezy_on_the_fly(["Ashton-cnderLyne"]) + assert (0.0 < candidates["Ashton-cnderLyne"]["Ashton-under-Lyne"] < 1.0) - candidates, already_collected_cands = myranker.deezy_on_the_fly( - ["Ashton-cnderLyne"] - ) - assert ( - 
candidates["Ashton-cnderLyne"]["Ashton-under-Lyne"] > 0.0 - and candidates["Ashton-cnderLyne"]["Ashton-under-Lyne"] < 1.0 - ) - - -def test_find_candidates(): +@pytest.mark.deezy(reason="Needs deezy model") +def test_ranking_find_candidates(tmp_path): myranker = ranking.Ranker( method="deezymatch", - resources_path="resources/wikidata/", + resources_path=os.path.join(current_dir,"sample_files/resources/"), mentions_to_wikidata=dict(), wikidata_to_mentions=dict(), strvar_parameters={ @@ -204,13 +167,13 @@ def test_find_candidates(): "top_threshold": 85, "min_len": 5, "max_len": 15, - "w2v_ocr_path": "resources/models/", + "w2v_ocr_path": str(tmp_path), "w2v_ocr_model": "w2v_1800s_news", "overwrite_dataset": False, }, deezy_parameters={ # Paths and filenames of DeezyMatch models and data: - "dm_path": "resources/deezymatch/", + "dm_path": os.path.join(current_dir,"sample_files/resources/deezymatch/"), "dm_cands": "wkdtalts", "dm_model": "w2v_ocr", "dm_output": "deezymatch_on_the_fly", @@ -228,22 +191,14 @@ def test_find_candidates(): # Test that perfect_match acts before deezy myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "London"}] - ) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "London"}]) assert candidates["London"]["London"]["Score"] == 1.0 assert "Q84" in candidates["London"]["London"]["Candidates"] # Test that deezy works myranker.already_collected_cands = {} - - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) - assert ( - candidates["Sheftield"]["Sheffield"]["Score"] > 0.0 - and candidates["Sheftield"]["Sheffield"]["Score"] < 1.0 - ) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}]) + assert (0.0 < candidates["Sheftield"]["Sheffield"]["Score"] < 1.0) assert "Q42448" in candidates["Sheftield"]["Sheffield"]["Candidates"] # Test that Perfect Match 
works @@ -251,17 +206,12 @@ def test_find_candidates(): # Test that perfect_match acts before deezy myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheffield"}] - ) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}]) assert candidates["Sheftield"] == {} # Test that check if contained works @@ -270,17 +220,13 @@ def test_find_candidates(): # Test that perfect_match acts before partialmatch myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheffield"}] - ) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert "Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}]) assert "Sheffield" not in candidates["Sheftield"] # Test that levenshtein works @@ -289,62 +235,11 @@ def test_find_candidates(): # Test that perfect_match acts before partialmatch myranker.mentions_to_wikidata = myranker.load_resources() - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheffield"}] - ) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheffield"}]) assert candidates["Sheffield"]["Sheffield"]["Score"] == 1.0 assert 
"Q42448" in candidates["Sheffield"]["Sheffield"]["Candidates"] myranker.already_collected_cands = {} - - candidates, already_collected_cands = myranker.find_candidates( - [{"mention": "Sheftield"}] - ) - assert ( - candidates["Sheftield"]["Sheffield"]["Score"] > 0.0 - and candidates["Sheftield"]["Sheffield"]["Score"] < 1.0 - ) - assert "Q42448" in candidates["Sheftield"]["Sheffield"]["Candidates"] - - -def test_deezy_candidate_ranker(): - deezy_parameters = { - # Paths and filenames of DeezyMatch models and data: - "dm_path": str(Path("resources/deezymatch/").resolve()), - "dm_cands": "wkdtalts", - "dm_model": "w2v_ocr", - "dm_output": "deezymatch_on_the_fly", - # Ranking measures: - "ranking_metric": "faiss", - "selection_threshold": 50, - "num_candidates": 1, - "verbose": False, - # DeezyMatch training: - "overwrite_training": False, - "do_test": False, - } - - dm_path = deezy_parameters["dm_path"] - dm_cands = deezy_parameters["dm_cands"] - dm_model = deezy_parameters["dm_model"] - dm_output = deezy_parameters["dm_output"] - - query = ["-", "ST G", "• - , i", "- P", "• FERRIS"] - - candidates = candidate_ranker( - candidate_scenario=os.path.join(dm_path, "combined", dm_cands + "_" + dm_model), - query=query, - ranking_metric=deezy_parameters["ranking_metric"], - selection_threshold=deezy_parameters["selection_threshold"], - num_candidates=deezy_parameters["num_candidates"], - search_size=deezy_parameters["num_candidates"], - verbose=deezy_parameters["verbose"], - output_path=os.path.join(dm_path, "ranking", dm_output), - pretrained_model_path=os.path.join( - f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".model" - ), - pretrained_vocab_path=os.path.join( - f"{dm_path}", "models", f"{dm_model}", f"{dm_model}" + ".vocab" - ), - ) - assert len(candidates) == len(query) + candidates, already_collected_cands = myranker.find_candidates([{"mention": "Sheftield"}]) + assert (0.0 < candidates["Sheftield"]["Sheffield"]["Score"] < 1.0) + assert "Q42448" in 
candidates["Sheftield"]["Sheffield"]["Candidates"] \ No newline at end of file diff --git a/tests/test_wiki_functions.py b/tests/test_wiki_functions.py index 74948c40..efc96510 100644 --- a/tests/test_wiki_functions.py +++ b/tests/test_wiki_functions.py @@ -1,6 +1,8 @@ import urllib -from utils import process_wikipedia +import pytest + +from t_res.utils import process_wikipedia def test_make_links_consistent(): @@ -15,7 +17,7 @@ def test_make_links_consistent(): assert (process_wikipedia.make_wikilinks_consistent(string_a) == string_a) is False assert process_wikipedia.make_wikilinks_consistent(string_c) == "new%20york" - +@pytest.mark.skip(reason="Needs large db file") def test_wikidata2wikipedia(): db = "resources/wikipedia/index_enwiki-latest.db" assert process_wikipedia.title_to_id("BOLOGNA", lower=True, path_to_db=db) == None