diff --git a/_sources/getting-started/installation.rst.txt b/_sources/getting-started/installation.rst.txt
index 8a5bdac..290a518 100644
--- a/_sources/getting-started/installation.rst.txt
+++ b/_sources/getting-started/installation.rst.txt
@@ -113,12 +113,19 @@ To add a package:
 
     $ poetry add [package name]
 
-To run the Python tests:
+To run the Python unit tests:
 
 .. code-block:: bash
 
     $ poetry run pytest
 
+To run unit and integration tests, some of which depend on the `T-Res resources <../getting-started/resources.html>`__:
+
+.. code-block:: bash
+
+    $ poetry run pytest tests --no-skip
+
+
 If you want to use Jupyter notebook, run it as usual, and then select the
 created kernel in "Kernel" > "Change kernel".
 
diff --git a/getting-started/installation.html b/getting-started/installation.html
index 2628936..51ac8c5 100644
--- a/getting-started/installation.html
+++ b/getting-started/installation.html
@@ -174,10 +174,14 @@ <h2>How to use poetry<a class="headerlink" href="#how-to-use-poetry" title="Perm
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>$<span class="w"> </span>poetry<span class="w"> </span>add<span class="w"> </span><span class="o">[</span>package<span class="w"> </span>name<span class="o">]</span>
 </pre></div>
 </div>
-<p>To run the Python tests:</p>
+<p>To run the Python unit tests:</p>
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>$<span class="w"> </span>poetry<span class="w"> </span>run<span class="w"> </span>pytest
 </pre></div>
 </div>
+<p>To run unit and integration tests, some of which depend on the <a class="reference external" href="../getting-started/resources.html">T-Res resources</a>:</p>
+<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>$<span class="w"> </span>poetry<span class="w"> </span>run<span class="w"> </span>pytest<span class="w"> </span>tests<span class="w"> </span>--no-skip
+</pre></div>
+</div>
 <p>If you want to use Jupyter notebook, run it as usual, and then select the
 created kernel in “Kernel” &gt; “Change kernel”.</p>
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>$<span class="w"> </span>jupyter<span class="w"> </span>notebook
diff --git a/reference/geoparser/ranker.html b/reference/geoparser/ranker.html
index df23fc1..013a724 100644
--- a/reference/geoparser/ranker.html
+++ b/reference/geoparser/ranker.html
@@ -196,7 +196,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.geoparser
 </div>
 <dl class="py method">
 <dt class="sig sig-object py" id="t_res.geoparser.ranking.Ranker.check_if_contained">
-<span class="sig-name descname"><span class="pre">check_if_contained</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">row</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.13)"><span class="pre">float</span></a></span></span><a class="headerlink" href="#t_res.geoparser.ranking.Ranker.check_if_contained" title="Permalink to this definition"></a></dt>
+<span class="sig-name descname"><span class="pre">check_if_contained</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">row</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.13)"><span class="pre">float</span></a></span></span><a class="headerlink" href="#t_res.geoparser.ranking.Ranker.check_if_contained" title="Permalink to this definition"></a></dt>
 <dd><p>Returns the amount of overlap, if a mention is contained within a row
 in the dataset.</p>
 <dl class="field-list simple">
@@ -229,7 +229,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.geoparser
 
 <dl class="py method">
 <dt class="sig sig-object py" id="t_res.geoparser.ranking.Ranker.damlev_dist">
-<span class="sig-name descname"><span class="pre">damlev_dist</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">row</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.13)"><span class="pre">float</span></a></span></span><a class="headerlink" href="#t_res.geoparser.ranking.Ranker.damlev_dist" title="Permalink to this definition"></a></dt>
+<span class="sig-name descname"><span class="pre">damlev_dist</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">query</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">row</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.13)"><span class="pre">float</span></a></span></span><a class="headerlink" href="#t_res.geoparser.ranking.Ranker.damlev_dist" title="Permalink to this definition"></a></dt>
 <dd><p>Calculate the Damerau-Levenshtein distance between a mention and a row
 in the dataset.</p>
 <dl class="field-list simple">
diff --git a/reference/utils/preprocess_data.html b/reference/utils/preprocess_data.html
index 6ee3901..c106778 100644
--- a/reference/utils/preprocess_data.html
+++ b/reference/utils/preprocess_data.html
@@ -191,7 +191,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.utils.pre
 </p>
 </dd>
 <dt class="field-odd">Return type<span class="colon">:</span></dt>
-<dd class="field-odd"><p><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)">pandas.DataFrame</a></p>
+<dd class="field-odd"><p><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)">pandas.DataFrame</a></p>
 </dd>
 </dl>
 <div class="admonition note">
@@ -239,7 +239,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.utils.pre
 </p>
 </dd>
 <dt class="field-odd">Return type<span class="colon">:</span></dt>
-<dd class="field-odd"><p><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)">pandas.DataFrame</a></p>
+<dd class="field-odd"><p><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)">pandas.DataFrame</a></p>
 </dd>
 </dl>
 <div class="admonition note">
@@ -351,7 +351,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.utils.pre
 </p>
 </dd>
 <dt class="field-odd">Return type<span class="colon">:</span></dt>
-<dd class="field-odd"><p><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)">pandas.DataFrame</a></p>
+<dd class="field-odd"><p><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)">pandas.DataFrame</a></p>
 </dd>
 </dl>
 <div class="admonition note">
diff --git a/reference/utils/process_data.html b/reference/utils/process_data.html
index 908016c..22acaa6 100644
--- a/reference/utils/process_data.html
+++ b/reference/utils/process_data.html
@@ -384,7 +384,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.utils.pro
 
 <dl class="py function">
 <dt class="sig sig-object py" id="t_res.utils.process_data.update_with_linking">
-<span class="sig-prename descclassname"><span class="pre">t_res.utils.process_data.</span></span><span class="sig-name descname"><span class="pre">update_with_linking</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ner_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">link_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></span><a class="headerlink" href="#t_res.utils.process_data.update_with_linking" title="Permalink to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">t_res.utils.process_data.</span></span><span class="sig-name descname"><span class="pre">update_with_linking</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ner_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">link_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></span><a class="headerlink" href="#t_res.utils.process_data.update_with_linking" title="Permalink to this definition"></a></dt>
 <dd><p>Updates the NER predictions by incorporating linking results.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
@@ -407,7 +407,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.utils.pro
 
 <dl class="py function">
 <dt class="sig sig-object py" id="t_res.utils.process_data.update_with_skyline">
-<span class="sig-prename descclassname"><span class="pre">t_res.utils.process_data.</span></span><span class="sig-name descname"><span class="pre">update_with_skyline</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ner_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">link_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></span><a class="headerlink" href="#t_res.utils.process_data.update_with_skyline" title="Permalink to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">t_res.utils.process_data.</span></span><span class="sig-name descname"><span class="pre">update_with_skyline</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ner_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">link_predictions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.Series.html#pandas.Series" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)"><span class="pre">Series</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><span class="pre">dict</span></a></span></span><a class="headerlink" href="#t_res.utils.process_data.update_with_skyline" title="Permalink to this definition"></a></dt>
 <dd><p>Updates the NER predictions with the skyline link from entity linking.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
diff --git a/reference/utils/rel_utils.html b/reference/utils/rel_utils.html
index 590e07b..851069d 100644
--- a/reference/utils/rel_utils.html
+++ b/reference/utils/rel_utils.html
@@ -253,7 +253,7 @@ <h1><code class="docutils literal notranslate"><span class="pre">t_res.utils.rel
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
-<li><p><strong>df</strong> (<a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1584.ge3e198f3cf)"><em>pandas.DataFrame</em></a>) – The pandas DataFrame containing the prepared
+<li><p><strong>df</strong> (<a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.DataFrame.html#pandas.DataFrame" title="(in pandas v3.0.0.dev0+1585.ge3a3a4a5fb)"><em>pandas.DataFrame</em></a>) – The pandas DataFrame containing the prepared
 dataset.</p></li>
 <li><p><strong>rel_params</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a>) – Dictionary containing the parameters for performing
 entity disambiguation using the <code class="docutils literal notranslate"><span class="pre">reldisamb</span></code> approach.</p></li>
diff --git a/searchindex.js b/searchindex.js
index b99aaa5..75b6d78 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["experiments/index", "getting-started/complete-tour", "getting-started/index", "getting-started/installation", "getting-started/resources", "index", "reference/geoparser/index", "reference/geoparser/linker", "reference/geoparser/pipeline", "reference/geoparser/ranker", "reference/geoparser/recogniser", "reference/index", "reference/utils/deezy_processing", "reference/utils/get_data", "reference/utils/index", "reference/utils/ner", "reference/utils/preprocess_data", "reference/utils/process_data", "reference/utils/process_wikipedia", "reference/utils/rel/entity_disambiguation", "reference/utils/rel/index", "reference/utils/rel/mulrel_ranker", "reference/utils/rel/utils", "reference/utils/rel/vocabulary", "reference/utils/rel_e2e", "reference/utils/rel_utils", "t-res-api/index", "t-res-api/installation", "t-res-api/usage"], "filenames": ["experiments/index.rst", "getting-started/complete-tour.rst", "getting-started/index.rst", "getting-started/installation.rst", "getting-started/resources.rst", "index.rst", "reference/geoparser/index.rst", "reference/geoparser/linker.rst", "reference/geoparser/pipeline.rst", "reference/geoparser/ranker.rst", "reference/geoparser/recogniser.rst", "reference/index.rst", "reference/utils/deezy_processing.rst", "reference/utils/get_data.rst", "reference/utils/index.rst", "reference/utils/ner.rst", "reference/utils/preprocess_data.rst", "reference/utils/process_data.rst", "reference/utils/process_wikipedia.rst", "reference/utils/rel/entity_disambiguation.rst", "reference/utils/rel/index.rst", "reference/utils/rel/mulrel_ranker.rst", "reference/utils/rel/utils.rst", "reference/utils/rel/vocabulary.rst", "reference/utils/rel_e2e.rst", "reference/utils/rel_utils.rst", "t-res-api/index.rst", "t-res-api/installation.rst", "t-res-api/usage.rst"], "titles": ["Experiments and evaluation", "The complete tour", "Getting started", "Installing T-Res", "Resources and directory structure", "T-Res: A Toponym Resolution 
Pipeline for Digitised Historical Newspapers", "<code class=\"docutils literal notranslate\"><span class=\"pre\">geoparser</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.linking.Linker</span></code>", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.pipeline.Pipeline</span></code>", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.ranking.</span> <span class=\"pre\">Ranker</span></code>", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.recogniser.Recogniser</span></code>", "Reference", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.deezy_processing</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.get_data</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">utils</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.ner</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.preprocess_data</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.process_data</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.process_wikipedia</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.entity_disambiguation</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">utils.REL</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.mulrel_ranker</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.t_res.utils</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.vocabulary</span></code> 
module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.rel_e2e</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.rel_utils</span></code> module", "Deploying the T-Res API", "Deploying the T-Res API", "Using the T-Res API"], "terms": {"follow": [0, 1, 3, 4, 8, 9, 15, 16, 17, 18, 26, 27, 28], "step": [0, 4, 8, 17, 19, 27, 28], "reproduc": 0, "our": [0, 1, 3, 4, 16], "paper": [0, 1, 21], "instruct": [0, 3, 4, 8], "directori": [0, 1, 2, 5, 7, 8, 9, 12, 16, 27], "structur": [0, 1, 2, 5, 7, 16, 27], "page": [0, 1, 4, 5, 16, 18, 21], "document": [0, 1, 4, 5, 8, 9, 16, 28], "requir": [0, 1, 4, 7, 8, 9, 12, 17, 19, 24, 25, 27], "To": [0, 1, 3, 4, 27], "creat": [0, 1, 3, 4, 7, 9, 10, 12, 16, 17, 28], "dataset": [0, 8, 9, 10, 12, 13, 15, 19, 24, 25], "we": [0, 1, 3, 4, 5, 15, 17, 19, 25], "us": [0, 2, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 21, 23, 24, 25, 26, 27], "present": [0, 18, 19], "command": [0, 3, 27], "from": [0, 4, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 27, 28], "folder": [0, 1, 4, 12, 17], "python": [0, 3, 4, 18, 28], "prepare_data": [0, 25], "py": [0, 3, 25, 26, 27], "p": [0, 1, 7, 19, 20, 21, 23, 27], "thi": [0, 1, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27], "script": [0, 3, 4, 16, 20, 23, 25], "take": [0, 1, 8, 9, 10, 15, 16, 25], "care": [0, 1], "download": [0, 1, 4, 13], "lwm": [0, 1, 4, 7, 13, 16, 17], "hipe": [0, 13, 16, 17, 24], "format": [0, 1, 4, 8, 15, 16, 17, 19, 24, 25], "them": [0, 1, 3, 4, 5, 8, 9, 12, 15, 16, 19, 27], "need": [0, 1, 3, 4, 7, 9, 16, 25, 26], "toponym_resolut": [0, 3, 28], "doe": [0, 1, 12, 19], "all": [0, 1, 3, 4, 7, 8, 15, 16, 17], "differ": [0, 1, 4, 7, 9, 15, 25], "scenario": 0, "report": 0, "approach": [0, 1, 4, 7, 9, 21, 24, 25], "tabl": [0, 4], "result": [0, 1, 4, 7, 8, 9, 12, 15, 17, 24], "one": [0, 1, 4, 8, 15, 16, 17], "provid": [0, 1, 4, 5, 7, 8, 9, 10, 12, 15, 17, 25, 26, 27], "go": 0, 
"There": [0, 15], "you": [0, 1, 3, 4, 8, 27, 28], "should": [0, 1, 4, 7, 15, 19, 23, 27, 28], "clone": [0, 3], "scorer": [0, 17, 24], "ar": [0, 1, 3, 4, 5, 8, 9, 10, 12, 15, 16, 17, 19, 20, 21, 25, 26, 27], "code": [0, 1, 4, 5, 16, 21, 23], "version": [0, 3, 4, 9], "commit": [0, 2, 5], "50dff4e": 0, "have": [0, 1, 3, 4, 17, 19, 20], "ad": [0, 1, 8, 17, 23, 25], "line": [0, 1, 4], "return": [0, 1, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 21, 23, 24, 25], "eval_stat": 0, "end": [0, 4, 5, 8, 10, 15, 16, 17, 24], "get_result": 0, "function": [0, 7, 8, 10, 12, 15, 16, 17, 18, 19, 21, 24, 25], "latex": 0, "display_result": 0, "t": [1, 2, 4, 15, 17], "re": [1, 2, 4], "ha": [1, 4, 5, 8, 9, 17], "three": [1, 4, 5, 8, 15, 17, 24], "main": [1, 5], "class": [1, 4, 5, 7, 8, 9, 10, 16, 19, 21, 23], "which": [1, 3, 4, 7, 8, 9, 15, 17, 19, 23, 24, 26, 27], "perform": [1, 4, 5, 7, 8, 9, 10, 15, 16, 17, 18, 19, 24, 25], "toponym": [1, 2, 8, 10, 16, 28], "recognit": [1, 2, 5, 8, 10, 15, 16, 17], "i": [1, 3, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 21, 23, 24, 25, 27, 28], "name": [1, 3, 4, 5, 8, 9, 10, 15, 16, 17, 21, 25, 27], "task": [1, 4, 7, 8, 10, 19], "select": [1, 3, 4, 5, 7, 8, 9, 12], "rank": [1, 5, 6, 7, 8, 11, 17, 21, 25], "identifi": [1, 4, 5, 8, 9, 15, 16, 18, 24, 25], "most": [1, 4, 7, 19], "like": [1, 4, 8, 17, 19], "those": [1, 9, 15], "addit": [1, 10], "wrap": [1, 7], "compon": [1, 5], "therefor": [1, 4], "make": [1, 3, 4, 18], "easier": 1, "user": [1, 3, 4, 26], "link": [1, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 21, 24, 25], "In": [1, 3, 4], "section": [1, 4, 27, 28], "depth": 1, "each": [1, 4, 5, 8, 9, 10, 15, 16, 17, 19, 24, 25], "four": [1, 4], "start": [1, 4, 5, 8, 10, 15, 16, 17, 24], "other": [1, 4], "refer": [1, 4, 5, 7, 9, 19, 20, 21, 23], "learn": [1, 4, 19], "more": [1, 4, 7, 9, 17, 23, 28], "about": [1, 4, 9, 17, 25], "also": [1, 3, 4, 5, 8, 9, 10, 15, 17, 26], "first": [1, 3, 4, 8, 9, 12, 15, 17], "try": [1, 8], "run": [1, 3, 4, 5, 7, 8, 9, 10, 24, 27], 
"default": [1, 4, 7, 8, 9, 10, 12, 17, 19, 23, 25, 27, 28], "chang": [1, 3, 4, 15], "accordingli": [1, 9], "your": [1, 3, 4, 5, 8, 16, 24, 26, 28], "note": [1, 4, 7, 8, 9], "befor": [1, 4, 25, 27], "being": 1, "abl": 1, "sure": [1, 3, 4, 28], "object": [1, 4, 7, 8, 9, 10, 12, 15, 17, 23, 24, 25], "By": [1, 4, 8], "huggingfac": [1, 4, 10, 15], "do": [1, 3, 4], "geopars": [1, 4, 5, 11, 17, 25, 27], "import": [1, 4, 27], "resources_path": [1, 7, 8, 9], "updat": [1, 2, 5, 9, 16, 17, 27], "path": [1, 3, 4, 7, 8, 9, 10, 12, 13, 16, 17, 18, 24, 27], "argument": [1, 10, 12], "reflect": 1, "set": [1, 2, 5, 7, 8, 9, 10, 12, 15, 16, 17, 19, 21, 24, 25, 27], "up": [1, 3, 4, 8, 12, 27], "can": [1, 3, 4, 8, 15, 25, 26, 27], "customis": 1, "see": [1, 4, 7, 8, 9, 15, 17, 19, 20, 21, 23, 28], "order": [1, 3, 4, 7, 16, 17], "just": 1, "beforehand": 1, "pass": [1, 8, 9, 12, 16, 21, 25], "myner": [1, 8, 17], "myrank": [1, 7, 8, 9, 12, 25], "mylink": [1, 7, 8, 25], "expect": [1, 4, 16], "experi": [1, 3, 4, 5, 7, 8, 16, 17, 24, 25, 27], "exampl": [1, 4, 7, 8, 9, 10, 15, 16, 17, 18, 26, 27, 28], "ani": [1, 8, 10, 17, 18, 25], "same": [1, 4, 15, 16, 17], "level": [1, 8, 16], "look": [1, 17], "right": 1, "locat": [1, 4, 7, 12, 16], "If": [1, 3, 4, 7, 8, 9, 10, 12, 15, 16, 17, 18, 25, 28], "itself": [1, 9], "time": [1, 4, 8, 19], "certain": [1, 7], "input": [1, 5, 8, 10, 12, 15, 24, 25], "paramet": [1, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 21, 23, 24, 25], "long": [1, 21], "readi": 1, "predict": [1, 4, 7, 8, 9, 10, 15, 17, 19, 24], "so": 1, "onc": 1, "been": [1, 3, 4, 5, 8, 9, 17, 20], "text": [1, 4, 5, 7, 8, 9, 10, 15, 16, 17, 18, 19, 24, 28], "individu": 1, "list": [1, 4, 8, 9, 10, 12, 15, 16, 17, 19, 24, 25], "two": [1, 4, 8, 9, 12, 15, 16, 17], "run_sent": [1, 8], "method": [1, 4, 7, 8, 9, 10, 17, 19, 21, 23], "appli": [1, 4, 7, 19], "run_text": [1, 8], "split": [1, 4, 7, 8, 9, 15, 17, 24, 25], "sentenc": [1, 4, 8, 9, 10, 15, 16, 17, 24, 28], "inspector": 1, "liddl": 1, "said": 1, "am": 
1, "polic": 1, "live": [1, 3, 10], "citi": [1, 4, 8, 16, 18], "durham": 1, "both": [1, 4, 15, 19, 25], "case": [1, 4, 15, 17, 25], "todo": 1, "docstr": 1, "place": [1, 4, 5, 7, 8, 16, 17, 28], "public": [1, 4, 7, 8, 16, 17, 25], "associ": [1, 7, 8, 16, 21, 23], "human": [1, 8], "legibl": [1, 8], "e": [1, 3, 4, 7, 8, 9, 12, 15, 16, 17, 18, 19, 24, 25, 27], "g": [1, 4, 7, 8, 9, 15, 16, 17, 18, 21, 24, 25], "london": [1, 4, 8, 9, 10, 28], "place_wqid": [1, 4, 8, 17, 28], "wikidata": [1, 2, 5, 7, 8, 9, 12, 16, 17, 18, 24, 25], "id": [1, 4, 5, 7, 8, 9, 12, 15, 16, 17, 18, 23, 24, 25], "q84": [1, 4, 7, 8, 9], "For": [1, 4, 7, 8, 9, 10, 15, 17], "alston": 1, "cumbria": 1, "england": [1, 4], "q2560190": 1, "ner_scor": [1, 8, 15, 17], "0": [1, 4, 8, 9, 10, 15, 16, 17, 19, 23, 27, 28], "999": 1, "po": [1, 8], "74": 1, "sent_idx": [1, 8], "end_po": [1, 8], "80": [1, 27], "tag": [1, 8, 15, 16, 17], "loc": [1, 4, 15, 16, 17], "q179815": 1, "ed_scor": [1, 8], "039": 1, "cross_cand_scor": [1, 8], "396": 1, "q23082": 1, "327": 1, "q49229": 1, "141": 1, "q5316459": 1, "049": 1, "q458393": 1, "045": 1, "q17003433": 1, "042": 1, "q1075483": 1, "string_match_scor": [1, 8], "q1137286": 1, "q5316477": 1, "q752266": 1, "prior_cand_scor": [1, 8], "881": 1, "522": 1, "457": 1, "455": 1, "313": 1, "295": 1, "293": 1, "latlon": [1, 8], "54": 1, "783333": 1, "566667": 1, "wkdt_class": [1, 8], "q515": [1, 4], "how": [1, 2, 4, 5, 8, 28], "run_text_recognit": [1, 8], "context": [1, 7, 8, 19, 21], "gold": [1, 8, 15, 17, 21, 24, 25], "none": [1, 4, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 21, 23, 24, 25], "ngram": [1, 8], "conf_md": [1, 8], "previou": [1, 8, 15, 16, 24, 28], "ner_output": 1, "cand": 1, "run_candidate_select": [1, 8], "score": [1, 4, 7, 8, 9, 10, 15, 16, 17, 19, 21], "022222222222222223": 1, "3157894736842105": 1, "013513513513513514": 1, "06484443152079093": 1, "final": [1, 4, 8], "disamb_output": 1, "run_disambigu": [1, 8], "exact": [1, 9], "wise": 1, "manner": 1, "regardless": 1, 
"field": [1, 4, 15, 25], "confid": [1, 7, 8, 10, 19], "posit": [1, 4, 8, 10, 12, 15, 16, 17, 24], "index": [1, 4, 5, 8, 15], "label": [1, 4, 8, 10, 15, 16, 17, 27], "qid": [1, 4, 24], "nil": [1, 7, 24], "A": [1, 4, 7, 8, 9, 10, 12, 15, 16, 17, 19, 23, 24, 25], "dictionari": [1, 4, 7, 8, 9, 10, 12, 15, 16, 17, 24, 25, 27], "match": [1, 4, 8, 9, 12, 16, 24], "prior": [1, 8], "cross": [1, 8], "latitud": [1, 4, 8], "longitud": [1, 4, 8], "coordin": [1, 4, 5, 8], "get": [1, 5, 19, 23, 28], "its": [1, 4, 8, 9, 12, 16, 17, 19, 21, 23], "significantli": 1, "less": 1, "complex": 1, "than": [1, 19], "better": 1, "mai": [1, 15], "bad": 1, "plan": 1, "modern": 1, "global": [1, 3], "clean": 1, "data": [1, 2, 5, 7, 9, 12, 16, 17, 18, 19, 25], "howev": 1, "account": [1, 16], "agnost": 1, "often": 1, "quantitav": 1, "quit": 1, "well": [1, 4], "becaus": [1, 15, 17, 19], "higher": 1, "probabl": 1, "common": [1, 4, 5], "sens": 1, "appear": [1, 4], "consider": 1, "longer": 1, "want": [1, 3, 4], "few": [1, 3], "larg": 1, "number": [1, 4, 9, 16, 23], "done": 1, "effici": 1, "save": [1, 4, 10, 12, 24], "lot": 1, "obtain": [1, 4, 5, 10, 12], "uniqu": [1, 16], "full": [1, 8, 17, 24], "per": [1, 4, 16, 17, 19, 21, 24], "basi": [1, 17], "assum": [1, 4, 16], "csv": 1, "row": [1, 4, 9, 16, 17], "df": [1, 4, 17, 25], "pd": [1, 4, 9, 17], "read_pickl": 1, "1880": 1, "1900": 1, "hmd": 1, "subsampl": 1, "wikidata_id": [1, 4], "find": [1, 4, 8, 9, 24], "datafram": [1, 16, 17, 25], "nlp_df": 1, "identified_toponym": 1, "progress_appli": 1, "lambda": 1, "x": [1, 28], "axi": 1, "whole": 1, "all_toponym": 1, "item": 1, "l": [1, 16], "all_cand": 1, "back": [1, 4], "top": [1, 4, 12, 16], "geograph": [1, 4, 5, 7, 8], "": [1, 4, 7, 8, 9, 15, 27, 28], "transform": [1, 10, 15], "librari": [1, 3, 4], "either": [1, 4, 15, 27], "directli": [1, 3, 4], "hub": [1, 4, 10], "local": [1, 4, 10, 21, 26], "store": [1, 4, 9, 10, 12, 17, 23, 24, 25, 27], "fine": [1, 10, 16], "tune": [1, 10], "new": [1, 4, 8, 12, 15, 16, 
18, 25, 27], "base": [1, 2, 5, 7, 9, 10, 12, 15, 16, 17, 19, 21, 23, 24], "alreadi": [1, 4, 7, 8, 9, 10], "pre": [1, 2, 4, 5, 10], "notebook": [1, 3, 28], "detect": [1, 4, 15, 16, 17], "train_use_ner_model": 1, "ipynb": [1, 28], "load_use_ner_model": 1, "load_from_hub": [1, 8, 10], "true": [1, 4, 7, 8, 9, 10, 12, 15, 27], "livingwithmachin": [1, 8], "19thc": [1, 4, 8], "en": [1, 8, 16, 18], "initialis": [1, 7, 8, 9, 10], "wai": [1, 17], "let": 1, "suppos": 1, "rel": [1, 4, 5, 7, 9, 11, 14, 24, 25], "blb_lwm": 1, "could": [1, 4, 18], "notic": [1, 7, 19, 20, 21, 23], "still": 1, "would": [1, 18, 19], "load_from_path": 1, "altern": [1, 4, 9], "below": [1, 4, 7, 8, 9, 12], "train_dataset": [1, 10], "ner_fine_train": [1, 4], "json": [1, 10, 17, 25, 28], "test_dataset": [1, 10], "ner_fine_dev": [1, 4], "base_model": [1, 10], "bert_1760_1900": 1, "model_path": [1, 7, 10], "training_arg": [1, 10], "batch_siz": [1, 10], "8": [1, 10, 16], "num_train_epoch": [1, 10], "10": [1, 4, 10, 16], "learning_r": [1, 10], "00005": [1, 10], "weight_decai": [1, 10], "overwrite_train": [1, 7, 9, 10, 12], "fals": [1, 4, 7, 8, 9, 10, 18, 19, 23], "do_test": [1, 7, 9, 10], "indic": [1, 4, 7, 9, 16, 23], "whether": [1, 4, 7, 8, 9, 10, 17, 23], "prepar": [1, 5, 17, 25], "unless": [1, 17], "even": 1, "specifi": [1, 4, 7, 8, 9, 10, 12, 23, 25], "bert": [1, 4, 10, 17], "nineteenth": 1, "centuri": [1, 4], "test": [1, 3, 4, 7, 9, 10, 17, 25, 27], "necessari": [1, 4, 7, 21], "inform": [1, 4, 7, 9, 15, 16, 17, 19, 20, 21, 23, 24, 25], "where": [1, 4, 9, 12, 13, 15, 17, 19, 24], "rate": 1, "batch": [1, 21], "size": [1, 23], "epoch": 1, "weight": 1, "decai": 1, "allow": [1, 4, 26], "mock": 1, "suffix": [1, 17], "_test": 1, "load_to_hub": 1, "skip": [1, 4, 7, 9, 10], "call": [1, 4, 9, 12, 15, 27], "taken": [1, 19, 20, 21, 23], "knowledg": [1, 7, 9, 19, 24], "accord": [1, 17, 27, 28], "similar": [1, 8, 9, 12, 17], "target": [1, 8], "subset": [1, 8], "next": [1, 4, 8], "gazett": [1, 16, 24], "combin": [1, 
5, 15], "wikipedia": [1, 2, 5, 7, 9, 12, 16, 18, 24], "strategi": 1, "ident": 1, "wiltshir": [1, 4], "q23183": [1, 4], "q55448990": [1, 4], "q8023421": [1, 4], "anchor": [1, 4], "partial": [1, 9], "between": [1, 4, 9, 12, 19, 21], "queri": [1, 9, 25, 28], "overlap": [1, 9, 17], "ashton": [1, 4, 15], "under": [1, 4, 15], "lyne": [1, 4, 15], "fuzzi": [1, 4, 9], "distanc": [1, 7, 9], "wiltshrr": 1, "accur": 1, "when": [1, 4, 7, 8, 9, 10, 15, 17, 21], "come": [1, 4, 8], "ocr": [1, 4, 10, 12, 16], "variat": [1, 4, 9, 12], "veri": 1, "slow": 1, "embed": [1, 2, 5, 7, 12, 21, 23, 25], "It": [1, 7, 8, 9, 10, 12, 15, 19, 25], "hour": 1, "fastest": 1, "except": [1, 4], "respect": [1, 10, 17], "contain": [1, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 24, 25, 26], "describ": [1, 4, 5, 7], "trickier": 1, "ideal": 1, "captur": 1, "type": [1, 4, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 23, 24, 25, 28], "found": [1, 4, 8, 9, 18, 23, 24, 25], "errror": 1, "train_use_deezy_model_1": 1, "train_use_deezy_model_2": 1, "train_use_deezy_model_3": 1, "detail": [1, 7, 9, 15], "file": [1, 3, 4, 12, 16, 17, 24, 25, 26, 27], "w2v_ocr_pair": [1, 4, 12], "txt": [1, 12], "characters_v001": 1, "vocab": 1, "input_dfm": [1, 12], "yaml": [1, 12], "news_dataset": [1, 4], "mentions_to_wikidata_norm": 1, "wikidata_to_mentions_norm": 1, "pathlib": 1, "strvar_paramet": [1, 9, 12], "dict": [1, 7, 8, 9, 10, 12, 15, 16, 17, 24, 25], "deezy_paramet": [1, 9, 12], "filenam": 1, "dm_path": [1, 9, 12], "str": [1, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 23, 24, 25], "resolv": [1, 4, 9, 19, 28], "dm_cand": [1, 9], "wkdtalt": [1, 9], "dm_model": [1, 9], "w2v_ocr": [1, 9], "dm_output": [1, 9], "deezymatch_on_the_fli": [1, 9], "measur": [1, 4], "ranking_metr": [1, 9], "faiss": [1, 9], "selection_threshold": [1, 9], "50": [1, 9], "num_candid": [1, 9], "verbos": [1, 9], "readm": 1, "left": [1, 4], "empti": [1, 4, 7, 8, 9, 15, 16, 17, 25], "sinc": 1, "realli": 1, "metric": [1, 9, 10], "vector": [1, 4, 12, 21], "threshold": [1, 9, 12], 
"maximum": [1, 17], "overwrit": [1, 7, 10], "mode": [1, 7, 9, 10], "w2v": [1, 4, 9], "w2v_1800s_new": 1, "syn1neg": [1, 4], "npy": [1, 4], "wv": [1, 4], "w2v_1860s_new": 1, "ocr_threshold": [1, 9], "60": [1, 9], "top_threshold": [1, 9], "85": [1, 9], "min_len": [1, 9], "5": [1, 4, 9], "max_len": [1, 9], "15": [1, 9, 16], "w2v_ocr_path": [1, 9], "w2v_ocr_model": [1, 9], "w2v_": [1, 9], "_new": [1, 9], "overwrite_dataset": [1, 9], "fuzzywuzzi": 1, "ratio": [1, 12, 19], "consid": [1, 12, 15], "neg": [1, 4, 12], "minimum": 1, "length": [1, 4], "word": [1, 2, 5, 10, 12, 15, 16, 17, 21, 23, 25], "word2vec": [1, 12], "regular": [1, 21], "express": [1, 17], "wikidata_norm": 1, "thei": [1, 4, 9, 17, 18], "mentions_to_wikidata": [1, 9, 12, 25], "load_resourc": [1, 7, 9], "model_state_dict": 1, "find_candid": [1, 8, 9], "kei": [1, 4, 7, 8, 9, 10, 15, 16, 17, 24, 25], "alwai": [1, 4, 27], "valu": [1, 4, 7, 9, 15, 16, 17, 24, 25], "question": [1, 4], "mancheft": 1, "print": [1, 8, 9, 10, 16], "best": 1, "depend": [1, 3, 9], "unsupervis": [1, 4, 7], "popular": [1, 7], "term": [1, 21], "inlink": 1, "implement": [1, 9, 21, 25], "ment": 1, "norm": 1, "algorithm": 1, "propos": 1, "le": [1, 21], "titov": [1, 21], "2018": [1, 21], "ganea": [1, 21], "hofmann": [1, 21], "2017": [1, 21], "adapt": [1, 4, 7, 10, 15, 18, 19, 20, 21, 26], "know": 1, "van": [1, 7, 19, 20, 21, 23], "hulst": [1, 7, 19, 20, 21, 23], "johann": [1, 7, 19, 20, 21, 23], "m": [1, 7, 19, 20, 21, 23], "faegheh": [1, 7, 19, 20, 21, 23], "hasibi": [1, 7, 19, 20, 21, 23], "koen": [1, 7, 19, 20, 21, 23], "dercksen": [1, 7, 19, 20, 21, 23], "krisztian": [1, 7, 19, 20, 21, 23], "balog": [1, 7, 19, 20, 21, 23], "arjen": [1, 7, 19, 20, 21, 23], "de": [1, 7, 19, 20, 21, 23], "vri": [1, 7, 19, 20, 21, 23], "stand": [1, 7, 19, 20, 21, 23], "shoulder": [1, 7, 19, 20, 21, 23], "giant": [1, 7, 19, 20, 21, 23], "proceed": [1, 7, 19, 20, 21, 23, 27], "43rd": [1, 7, 19, 20, 21, 23], "intern": [1, 7, 19, 20, 21, 23], "acm": [1, 7, 19, 
20, 21, 23], "sigir": [1, 7, 19, 20, 21, 23], "confer": [1, 7, 19, 20, 21, 23], "research": [1, 4, 7, 19, 20, 21, 23], "develop": [1, 4, 7, 19, 20, 21, 23], "pp": 1, "2197": 1, "2200": 1, "2020": [1, 7, 17, 19, 20, 21, 23], "phong": [1, 21], "ivan": [1, 21], "improv": [1, 4, 21], "latent": [1, 21], "relat": [1, 8, 9, 10, 21], "56th": [1, 21], "annual": [1, 21], "meet": [1, 21], "comput": [1, 9, 10, 19, 21], "linguist": [1, 21], "volum": [1, 21, 27], "1595": [1, 21], "1604": [1, 21], "octavian": [1, 21], "eugen": [1, 21], "thoma": [1, 21, 28], "deep": [1, 9, 19, 21], "joint": [1, 21], "neural": [1, 9, 21], "attent": [1, 21], "empir": [1, 21], "natur": [1, 8, 21], "languag": [1, 8, 12, 21], "process": [1, 4, 7, 8, 9, 10, 15, 16, 17, 21], "2619": [1, 21], "2629": [1, 21], "least": [1, 4], "entity2class": 1, "wikidata_gazett": 1, "sqlite3": [1, 4, 7], "connect": [1, 4, 7, 15, 25], "rel_db": [1, 4, 7], "embeddings_databas": [1, 4, 7], "db": [1, 4, 7, 18, 24], "conn": [1, 4, 7], "cursor": [1, 4, 7, 25], "rel_param": [1, 7, 8, 25], "data_path": [1, 7], "training_split": [1, 7], "originalsplit": [1, 4, 7, 24], "db_embed": [1, 7], "with_publ": [1, 7], "without_microtoponym": [1, 7, 8], "default_publnam": [1, 7], "default_publwqid": [1, 7], "specif": [1, 15, 16, 19, 24], "linking_df_split": [1, 4], "tsv": [1, 4, 16, 17, 24], "column": [1, 4, 9, 16], "databas": [1, 4, 7, 24, 25], "featur": 1, "filter": [1, 9, 12], "out": [1, 4], "microtoponym": [1, 8], "overrid": 1, "ignor": [1, 19], "As": 1, "infer": 1, "characterist": 1, "linking_resourc": [1, 7], "ed_model": 1, "train_load_model": [1, 7], "self": [1, 24], "whose": [1, 21, 25], "instal": [2, 5, 26, 27], "system": [2, 5, 9], "pyenv": [2, 5], "poetri": [2, 5, 27], "project": [2, 5], "hoook": [2, 5], "resourc": [2, 5, 7, 8, 9, 16, 24, 27], "disambigu": [2, 5, 7, 8, 19, 21, 25], "train": [2, 5, 7, 9, 10, 12, 15, 16, 19, 21, 25], "entiti": [2, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25], "deezymatch": [2, 5, 9, 
12], "summari": [2, 5], "The": [2, 4, 5, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 20, 21, 23, 24, 25, 26, 27, 28], "complet": [2, 5, 16], "tour": [2, 5], "pipelin": [2, 4, 6, 10, 11, 17, 26, 27], "recognis": [2, 5, 6, 8, 11, 17, 27], "ranker": [2, 5, 6, 7, 8, 11, 12, 25, 27], "linker": [2, 4, 5, 6, 8, 11, 19, 20, 21, 23, 27], "work": [3, 4, 5], "codebas": 3, "suggest": 3, "linux": 3, "ubuntu": 3, "20": [3, 7, 18, 19, 20, 21, 23, 28], "04": 3, "date": 3, "essenti": 3, "sudo": [3, 27], "apt": 3, "build": [3, 4, 5, 17, 26], "curl": [3, 28], "libbz2": 3, "dev": [3, 4], "libffi": 3, "liblzma": 3, "libncursesw5": 3, "libreadlin": 3, "libsqlite3": 3, "libssl": 3, "libxml2": 3, "libxmlsec1": 3, "llvm": 3, "tk": 3, "wget": 3, "xz": 3, "util": [3, 4, 5, 11, 27], "zlib1g": 3, "Then": 3, "manag": [3, 7], "virtual": 3, "environ": [3, 27], "http": [3, 12, 16, 17, 18, 27, 28], "bash": [3, 27], "And": 3, "properli": 3, "export": [3, 27], "echo": 3, "pyenv_root": 3, "home": [3, 27], "bashrc": 3, "bin": 3, "v": [3, 27], "1": [3, 5, 8, 9, 15, 17, 19, 21, 26], "null": 3, "2": [3, 5, 17, 26], "n": [3, 10, 12], "eval": 3, "init": 3, "nfi": 3, "restart": [3, 27], "session": [3, 4], "sourc": [3, 16], "environemnt": 3, "3": [3, 4, 5, 7, 16, 17, 19, 21, 26], "9": [3, 4], "7": [3, 4, 21], "dipend": 3, "across": [3, 9], "ssl": 3, "org": [3, 16, 18], "python3": 3, "now": [3, 4, 8, 17, 28], "repo": 3, "cd": 3, "git": 3, "github": [3, 4, 7, 12, 17, 18, 19, 20, 21, 23], "com": [3, 12, 17, 18], "machin": 3, "explicitli": 3, "tell": 3, "defin": [3, 12], "abov": [3, 4, 7, 17], "env": 3, "kernel": 3, "ipython": 3, "kernel_nam": 3, "activ": 3, "shell": 3, "usual": 3, "instanc": [3, 10, 15], "add": [3, 4, 21, 23, 25, 27], "packag": 3, "pytest": 3, "jupyt": 3, "guarante": 3, "style": 3, "consist": [3, 4, 12, 15, 16, 17, 18, 26], "basic": 3, "hook": 3, "sever": 4, "some": [4, 15], "load": [4, 7, 9, 10, 27], "web": 4, "gener": [4, 9, 12, 16, 25, 28], "give": 4, "option": [4, 7, 8, 9, 10, 12, 16, 17, 18, 23, 
25, 27], "own": [4, 17, 26], "an": [4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 19, 20, 21, 23, 25], "focus": 4, "english": [4, 12], "topres19th": 4, "british": 4, "repositori": [4, 7, 13, 19, 20, 21, 23, 26, 27], "output": [4, 7, 8, 12, 17, 24], "ner_fine_test": 4, "continu": 4, "read": [4, 16], "descript": [4, 15], "don": 4, "ner": [4, 5, 8, 10, 11, 14, 16, 17], "model": [4, 7, 8, 9, 10, 12, 15, 16, 17, 19, 21, 25], "otherwis": [4, 9, 10, 17, 18], "correspond": [4, 5, 7, 8, 9, 12, 15, 16, 17, 18, 19, 24, 25], "pair": [4, 9, 12, 17], "string": [4, 8, 9, 12, 15, 16, 17, 24, 25], "token": [4, 10, 12, 15, 16, 17, 23, 24], "ner_tag": [4, 15, 16], "annot": [4, 16, 17, 24], "bio": [4, 16], "3896239_29": 4, "o": [4, 10, 15, 16, 17], "b": [4, 15, 16, 17], "street": 4, "old": 4, "millgat": 4, "collegi": 4, "church": 4, "arriv": 4, "littl": 4, "after": [4, 8, 16, 18], "ten": 4, "oclock": 4, "8262498_11": 4, "On": 4, "jsth": 4, "novemb": 4, "ship": 4, "santo": 4, "christo": 4, "monteveido": 4, "cadiz": 4, "hide": 4, "copper": 4, "10715509_7": 4, "coach": 4, "southampton": 4, "everi": 4, "morn": 4, "quarter": 4, "6": 4, "sundai": 4, "automat": [4, 28], "won": 4, "mostpopular": [4, 7, 8], "ed": [4, 19], "singl": [4, 8, 15, 28], "article_id": [4, 16, 17], "articl": [4, 16, 17, 24, 25], "origin": [4, 7, 8, 9, 15, 16, 17, 21, 23, 25], "1218_poole1860": 4, "1218": 4, "sentence_po": [4, 17], "sentence_text": 4, "dukinfield": 4, "knutsford": 4, "servant": 4, "girl": 4, "eliza": 4, "ann": 4, "byrom": 4, "who": 4, "stole": 4, "quantiti": 4, "cloth": 4, "hous": 4, "she": 4, "lodg": 4, "dukiafield": 4, "wa": [4, 7, 9, 10], "month": 4, "imprison": 4, "mention": [4, 5, 7, 8, 9, 12, 15, 17, 19, 21, 25], "mention_po": 4, "actual": [4, 16], "entity_typ": [4, 15], "wkdt_qid": [4, 16], "mention_start": 4, "charact": [4, 10, 15, 16, 17, 18, 24], "mention_end": 4, "sent_po": 4, "q1976179": 4, "q1470791": 4, "104": 4, "114": 4, "newspap": [4, 10], "belong": [4, 16], "manchest": [4, 28], "uk": 4, "leav": 
4, "left_out": 4, "maintain": [4, 15], "66": 4, "33": 4, "divid": 4, "withouttest": 4, "seri": [4, 7, 9, 17, 19, 20, 21, 23], "These": 4, "wiki2gaz": 4, "soon": 4, "built": [4, 27], "content": [4, 28], "prefer": 4, "mean": [4, 16], "inner": [4, 17, 24], "absolut": 4, "count": [4, 25], "particular": 4, "access": [4, 8, 26], "open": [4, 25], "r": 4, "f": [4, 21, 27], "assign": [4, 7, 9, 15, 16, 17], "4457": 4, "map": [4, 8, 9, 12, 15, 16, 17, 18, 24, 25], "through": [4, 8, 9, 17], "normalis": [4, 8, 23], "9767696690773614": 4, "03125": 4, "frequenc": [4, 9], "005478851632697786": 4, "wilton": 4, "00021915406530791147": 4, "colleg": 4, "council": 4, "0015340784571553803": 4, "west": 4, "north": 4, "wilt": 4, "counti": 4, "0026298487836949377": 4, "010081087004163929": 4, "unit": [4, 7], "kingdom": [4, 7], "plain": [4, 24], "97": 4, "show": [4, 5, 8, 9, 28], "five": [4, 17], "panda": [4, 9, 16, 17, 25], "read_csv": 4, "head": 4, "english_label": 4, "q5059107": 4, "centenni": 4, "40": [4, 17], "01140": 4, "87": 4, "24330": 4, "q5059144": 4, "ground": [4, 19], "39": 4, "99270": 4, "75": [4, 9], "19380": 4, "q5059153": 4, "high": [4, 19], "school": 4, "06170": 4, "83": 4, "05780": 4, "q5059162": 4, "38": [4, 17], "30440": 4, "63800": 4, "4": [4, 5, 10, 17, 19, 26], "q5059178": 4, "memori": 4, "samsung": 4, "hall": 4, "37": 4, "58949": 4, "127": 4, "03434": 4, "onli": [4, 15, 17], "post": [4, 8], "render": 4, "dummi": 4, "float": [4, 7, 8, 9, 12, 17, 19], "interest": 4, "affect": 4, "likewis": 4, "q180673": 4, "cerimoni": 4, "wherea": 4, "complain": 4, "resolut": [4, 16], "wiki2vec": 4, "entity_embed": 4, "lower": [4, 18, 23], "preced": 4, "wildcard": 4, "unk": [4, 23], "emb": 4, "deriv": 4, "share": 4, "toward": 4, "meanwhil": 4, "pleas": 4, "index_enwiki": [4, 24], "latest": [4, 24, 27], "arrai": [4, 25], "execut": [4, 7, 9, 10], "lerwick": 4, "fetchon": 4, "els": 4, "tolist": 4, "3257000148296356": 4, "00989999994635582": 4, "13420000672340393": 4, "014700000174343586": 
4, "007899999618530273": 4, "1808999925851822": 4, "candid": [4, 5, 7, 8, 9, 12, 17, 19, 21, 25], "perfectmatch": [4, 8, 9], "digitis": 4, "word1": 4, "word2": 4, "boolean": 4, "might": 4, "wish": 4, "hardli": 4, "didnot": 4, "never": 4, "reus": 4, "coeld": 4, "conld": 4, "couid": 4, "histor": [4, 10], "nois": [4, 9], "expand": 4, "extract": [4, 8, 9, 17, 25], "085": 4, "514": 4, "billion": 4, "corpu": 4, "19th": 4, "zenodo": 4, "period": 4, "year": [4, 7, 16, 17, 19, 20, 21, 23], "w2v_xxxxs_new": 4, "xxxx": 4, "decad": [4, 16], "1800": 4, "1810": 4, "w2v_1800_new": 4, "w2v_1810_new": 4, "tutori": [4, 10], "app": [4, 26, 27, 28], "evalu": [4, 5, 10, 17], "wikidta_gazett": 4, "mark": 4, "asterisk": 4, "instanti": [4, 7, 8, 27], "plu": 4, "sign": 4, "given": [5, 7, 8, 9, 10, 12, 16, 17, 18, 19, 21, 23, 25], "design": 5, "tackl": 5, "problem": [5, 15], "deploi": [5, 28], "api": [5, 24], "element": [5, 15, 16, 17, 24, 25], "modul": [5, 11], "t_re": [5, 6, 11, 14, 20], "deezy_process": [5, 11, 14], "get_data": [5, 11, 14], "preprocess_data": [5, 11, 14], "process_data": [5, 11, 14], "process_wikipedia": [5, 11, 14], "rel_e2": [5, 11, 14], "rel_util": [5, 11, 14], "multipl": [5, 15, 26], "via": [5, 26, 28], "docker": [5, 26], "compos": [5, 26], "configur": [5, 26], "deploy": [5, 26], "extern": 5, "search": 5, "random_se": [6, 7, 19, 20], "liter": [7, 9, 15, 25], "reldisamb": [7, 25], "bydist": 7, "experiments_path": [7, 8], "bool": [7, 8, 9, 10, 17, 18, 21, 23], "flag": [7, 9, 23], "radboud": [7, 19, 20, 21, 23], "entityt": 7, "establish": 7, "wpubl": [7, 27, 28], "wmtop": [7, 27, 28], "q145": 7, "by_dist": 7, "dict_ment": 7, "origin_wqid": 7, "tupl": [7, 8, 9, 12, 15, 16, 17, 24], "relev": [7, 8, 9], "calcul": [7, 9], "closest": 7, "round": 7, "decim": 7, "undertak": 7, "most_popular": 7, "determin": 7, "entitydisambigu": [7, 19, 20], "entity_disambigu": [7, 14, 20], "exist": [7, 9, 10, 12, 16, 18, 19], "initi": [7, 10, 17, 25], "had": 7, "credit": [7, 10, 15, 18, 19, 
21, 23], "copyright": [7, 19, 20, 21, 23], "c": [7, 19, 20, 21, 23], "michael": [7, 19, 20, 21, 23], "permiss": [7, 19, 20, 21, 23], "inproceed": [7, 19, 20, 21, 23], "vanhulst": [7, 19, 20, 21, 23], "author": [7, 19, 20, 21, 23], "titl": [7, 16, 18, 19, 20, 21, 23, 24], "booktitl": [7, 19, 20, 21, 23], "retriev": [7, 12, 17, 19, 20, 21, 23, 24, 25], "publish": [7, 19, 20, 21, 23], "42": [7, 19], "repres": [8, 9, 10, 15, 17, 23, 25], "includ": [8, 9, 10, 16, 20], "setup": 8, "visit": 8, "pari": 8, "york": [8, 16, 18], "last": [8, 18], "summer": 8, "processed_data": [8, 17], "format_predict": 8, "wk_cand": [8, 25], "int": [8, 12, 15, 17, 23, 24], "document_dataset": 8, "nest": [8, 15], "outermost": 8, "potenti": 8, "second": [8, 9, 12, 17], "salop": 8, "q201970": 8, "0006031363088057901": 8, "q23103": 8, "0075279261777561925": 8, "postprocess_output": 8, "larger": 8, "postprocess": [8, 17], "exclud": [8, 25], "dure": [8, 15, 17, 23], "along": [8, 16, 25], "run_sentence_recognit": 8, "entir": 8, "keyword": [8, 12], "futur": 8, "point": [8, 17, 25], "standard": [8, 16, 17, 24], "remain": [8, 9], "redund": 8, "partialmatch": 9, "levenshtein": 9, "wikidata_to_ment": [9, 12], "already_collected_cand": 9, "perfect": 9, "handl": [9, 10], "collect": [9, 15], "applic": [9, 28], "paraguai": 9, "already_collect": 9, "mention_candid": 9, "mention_already_collect": 9, "check_if_contain": 9, "amount": 9, "within": 9, "degre": 9, "rang": 9, "appl": 9, "delici": 9, "match_scor": 9, "3333333333333333": 9, "damlev_dist": 9, "damerau": 9, "etiti": 9, "lowercas": [9, 18, 23, 25], "normal": [9, 19, 23], "subtract": 9, "orang": 9, "1666666865348816": 9, "deezy_on_the_fli": 9, "network": [9, 21], "fly": 9, "attribut": [9, 10], "shefrield": 9, "sheffield": 9, "03382000000000005": 9, "perfectli": 9, "sub": 9, "guadaloup": 9, "sn83030483": [9, 17], "1790": [9, 17], "03": [9, 17], "31": 9, "i0004_1": 9, "q17012": 9, "003935458480913026": 9, "q3153836": 9, "07407407407407407": 9, "appropri": 
[9, 15], "chosen": 9, "addition": 9, "q2477346": 9, "remov": [9, 18], "pandarallel": 9, "partial_match": 9, "damlev": 9, "banana": 9, "perfect_match": 9, "altnam": 9, "check": 9, "vari": 9, "further": 9, "barcelona": 9, "bologna": 9, "deleg": 9, "pipe": 10, "5e": 10, "05": 10, "uncas": 10, "create_pipelin": 10, "ner_predict": [10, 17], "99975187": [10, 17], "dash": 10, "replac": [10, 15, 18, 23], "comma": 10, "pars": [10, 25], "issu": [10, 15], "align": [10, 15, 17], "trainer": 10, "obtain_match": [12, 14], "english_word": 12, "sim": 12, "fuzz_ratio_threshold": 12, "union": [12, 15], "70": 12, "classifi": 12, "100": 12, "nearest": 12, "neighbor": 12, "discard": 12, "thefuzz": 12, "fuzz": 12, "seatgeek": 12, "simpl": 12, "create_training_set": [12, 14], "neighbour": 12, "randomli": 12, "insid": [12, 16], "train_deezy_model": [12, 14], "generate_candid": [12, 14], "write": 12, "download_lwm_data": [13, 14], "news_path": 13, "bl": 13, "unzip": 13, "download_hipe_data": [13, 14], "hipe_path": [13, 16], "training_tokenize_and_align_label": [14, 15], "collect_named_ent": [14, 15], "aggregate_ment": [14, 15], "fix_capit": [14, 15], "fix_hyphen": [14, 15], "fix_nest": [14, 15], "fix_startent": [14, 15], "aggregate_ent": [14, 15], "turn_wikipedia2wikidata": [14, 16], "reconstruct_sent": [14, 16], "process_lwm_for_n": [14, 16], "process_lwm_for_link": [14, 16], "aggregate_hipe_ent": [14, 16], "process_hipe_for_link": [14, 16], "process_tsv": [14, 16], "fine_to_coars": [14, 16], "eval_with_except": [14, 17, 25], "prepare_s": [14, 17], "align_gold": [14, 17], "postprocess_predict": [14, 17], "ner_and_process": [14, 17], "update_with_link": [14, 17], "update_with_skylin": [14, 17], "prepare_storing_link": [14, 17], "store_for_scor": [14, 17], "make_wikilinks_consist": [14, 18], "make_wikipedia2wikidata_consis": [14, 18], "title_to_id": [14, 18], "rel_end_to_end": [14, 24], "get_rel_from_api": [14, 24], "match_wikipedia_to_wikidata": [14, 24], "match_ent": [14, 24], 
"postprocess_rel": [14, 24], "store_rel": [14, 24], "run_rel_experi": [14, 24], "get_db_emb": [14, 25], "prepare_initial_data": [14, 25], "rank_candid": [14, 25], "add_publ": [14, 25], "prepare_rel_trainset": [14, 25], "mulrel_rank": [14, 20], "vocabulari": [14, 20], "pretrainedtoken": 15, "pretrainedtokenizerfast": 15, "label_encoding_dict": 15, "encod": [15, 18], "label2id": 15, "tokenization_utils_bas": 15, "batchencod": 15, "namedtupl": 15, "iter": 15, "over": [15, 19], "keep": [15, 17], "start_char": [15, 17], "end_char": [15, 17], "reserv": 15, "offset": [15, 24], "e_typ": 15, "start_offset": [15, 17], "end_offset": [15, 17], "pred": [15, 17], "aggreg": [15, 16], "consolid": 15, "reconstruct": [15, 16], "white": [15, 16], "space": [15, 16, 18], "haven": [15, 17], "yet": [15, 17], "manual": [15, 17], "ner_label": [15, 17], "entity_link": [15, 17], "correct": [15, 28], "capit": 15, "error": [15, 17, 25], "occur": [15, 17], "incorrect": 15, "surfac": 15, "form": [15, 25], "lentiti": [15, 16], "fix": 15, "prefix": [15, 16, 25, 27], "hyphen": 15, "incorrectli": 15, "address": 15, "group": 15, "sequenc": 15, "regard": 15, "phrase": 15, "solut": 15, "current": [15, 16, 17], "part": [15, 16, 17, 18], "island": 15, "terceira": 15, "preposit": 15, "begin": 15, "instead": [15, 17, 26], "join": [15, 16], "wikipedia_titl": 16, "wikipedia_path": 16, "convert": [16, 18, 23, 25], "avail": [16, 21, 27, 28], "wiki": [16, 18], "colosseum": 16, "q10285": 16, "ancient_egypt": 16, "q11768": 16, "invalid_loc": 16, "warn": 16, "wikipedia2wikidata": [16, 18], "dtoken": 16, "ensur": 16, "tsv_topres_path": 16, "scheme": 16, "10813493_1": 16, "document_id": 16, "_": [16, 17], "sentence_id": [16, 17], "india": 16, "annotated_tsv": 16, "occurr": 16, "resources_dir": 16, "gazetteer_id": [16, 24], "ocr_quality_mean": [16, 17], "qualiti": 16, "ocr_quality_sd": [16, 17], "deviat": 16, "publication_titl": [16, 17], "publication_cod": [16, 17], "metadata": [16, 17], "consecut": 16, "ne_typ": 
16, "q60": 16, "12": 16, "meto_typ": 16, "updated_ent": 16, "multi": [16, 21], "help": 16, "contigu": 16, "filepath": 16, "webanno": 16, "dmtoken": 16, "six": 16, "url": [16, 18], "grain": 16, "coars": 16, "equival": 16, "str2pars": [17, 25], "in_cas": [17, 25], "ast": 17, "literal_ev": 17, "succe": 17, "valueerror": 17, "success": [17, 25], "dsentenc": [17, 24], "10732214_1": 17, "10732214": 17, "unprocess": 17, "dannot": 17, "anoth": 17, "dmetadata": 17, "produc": [17, 21], "relabel": 17, "petr": 17, "q335322": 17, "gold_posit": 17, "later": 17, "tokenis": 17, "enabl": [17, 27], "assess": 17, "sentence_pr": 17, "represent": 17, "sentence_tru": 17, "sentence_ski": 17, "dpred": 17, "999826967716217": 17, "_ner_predict": 17, "dtrue": 17, "_gold_standard": 17, "dsky": 17, "skylin": 17, "At": 17, "without": 17, "fill": 17, "_ner_skylin": 17, "gold_token": [17, 24], "unitec": 17, "193": 17, "199": 17, "q30": 17, "_gold_posit": 17, "dmentionspr": 17, "i0001_9": 17, "state": 17, "206": 17, "79": 17, "_pred_ment": 17, "dmentionsgold": 17, "analog": 17, "link_predict": 17, "incorpor": 17, "test_df": 17, "all_test": 17, "end_to_end_ev": 17, "sky": 17, "achiev": 17, "choos": 17, "among": 17, "u": 17, "hipe_scorer_results_path": 17, "scenario_nam": 17, "dresult": 17, "articles_test": 17, "clef": 17, "conll": [17, 24], "impresso": 17, "oper": 18, "unquot": 18, "decod": 18, "percent": 18, "underscor": [18, 24], "fragment": 18, "symbol": 18, "quot": 18, "modifi": 18, "special": 18, "python_": 18, "programming_languag": 18, "overview": 18, "28program": 18, "20languag": 18, "29": 18, "data_sci": 18, "20scienc": 18, "san_francisco": 18, "san": 18, "20francisco": 18, "mapper": 18, "make_wikipedia2wikidata_consist": 18, "new_york_c": 18, "scienc": 18, "page_titl": 18, "path_to_db": [18, 24], "unescap": 18, "fermat": 18, "27s_last_theorem": 18, "s_last_theorem": 18, "wikidata2wikipedia": 18, "entri": 18, "manate": 18, "wiki_page_titl": 18, "jcklie": 18, "wikimapp": 18, "db_emb": 19, 
"user_config": 19, "reset_embed": 19, "architectur": 19, "mulrelrank": [19, 20, 21], "get_data_item": 19, "dname": 19, "respons": [19, 21], "trigger": 19, "prerank": [19, 20, 21], "normalize_scor": 19, "rescal": 19, "sum": 19, "truth": 19, "possibl": [19, 25], "max": 19, "org_train_dataset": 19, "org_dev_dataset": 19, "train_lr": 19, "train_json": 19, "dev_json": 19, "model_path_lr": 19, "lr": 19, "recal": 19, "low": 19, "corrrect": 19, "config": [21, 26, 27], "multipli": 21, "minim": 21, "forward": 21, "token_id": 21, "token_offset": 21, "entity_id": 21, "devic": 21, "mulrel": 21, "nel": 21, "ganea2017deep": 21, "le2018improv": 21, "tok_mask": 21, "entity_mask": 21, "p_e_m": 21, "ctx_layer": 21, "figur": 21, "ent_scor": 21, "q": 21, "score_combin": 21, "loss": 21, "true_po": 21, "lamb": 21, "1e": 21, "07": 21, "equat": 21, "max_norm": 21, "add_to_vocab": 23, "get_id": 23, "unknown": 23, "static": 23, "digit_0": 23, "rule": [23, 27], "digit": 23, "unk_token": 23, "sent": 24, "rel_end2end_path": 24, "wiki_titl": 24, "separ": 24, "pred_ent": 24, "prev_ann": 24, "rel_pr": 24, "wikigaz_id": 24, "retoken": 24, "drel": 24, "how_split": 24, "ashton1860": 24, "embtyp": 25, "snd": 25, "ndarrai": 25, "wikipedia2vec": 25, "np": 25, "preappend": 25, "rel_json": 25, "publnam": 25, "publwqid": 25, "dsplit": 25, "fastapi": 26, "remot": 26, "app_templ": [26, 27], "config_nam": 26, "templat": [26, 27], "dockerfil": [26, 27], "yml": [26, 27], "fit": 26, "standalon": 27, "simultan": 27, "behind": 27, "revers": 27, "proxi": 27, "traefik": 27, "server": [27, 28], "offici": 27, "guid": 27, "imag": 27, "res_deezy_reldisamb": [27, 28], "root": 27, "container_nam": 27, "cach": 27, "arg": 27, "app_nam": 27, "_imag": 27, "ref": 27, "preprocess": 27, "balanc": 27, "host_url": 27, "your_host_url": 27, "d": [27, 28], "edit": 27, "endpoint": [27, 28], "behaviour": 27, "variabl": 27, "servic": 27, "your_config_nam": 27, "expos": [27, 28], "loadbalanc": 27, "port": [27, 28], "router": 27, 
"_router": 27, "host": 27, "pathprefix": 27, "v2": [27, 28], "res_": 27, "middlewar": 27, "stripprefix": 27, "rwop": 27, "uvicorn": 27, "header": 27, "8000": 28, "interact": 28, "swagger": 28, "doc": 28, "184": 28, "45": 28, "h": 28, "harvei": 28, "elizabeth": 28, "barnett": 28, "q18125": 28, "api_usag": 28, "variou": 28}, "objects": {"t_res.geoparser.linking": [[7, 0, 1, "", "Linker"], [7, 2, 1, "", "RANDOM_SEED"]], "t_res.geoparser.linking.Linker": [[7, 1, 1, "", "by_distance"], [7, 1, 1, "", "load_resources"], [7, 1, 1, "", "most_popular"], [7, 1, 1, "", "run"], [7, 1, 1, "", "train_load_model"]], "t_res.geoparser.pipeline": [[8, 0, 1, "", "Pipeline"]], "t_res.geoparser.pipeline.Pipeline": [[8, 1, 1, "", "format_prediction"], [8, 1, 1, "", "run_candidate_selection"], [8, 1, 1, "", "run_disambiguation"], [8, 1, 1, "", "run_sentence"], [8, 1, 1, "", "run_sentence_recognition"], [8, 1, 1, "", "run_text"], [8, 1, 1, "", "run_text_recognition"]], "t_res.geoparser.ranking": [[9, 0, 1, "", "Ranker"]], "t_res.geoparser.ranking.Ranker": [[9, 1, 1, "", "check_if_contained"], [9, 1, 1, "", "damlev_dist"], [9, 1, 1, "", "deezy_on_the_fly"], [9, 1, 1, "", "find_candidates"], [9, 1, 1, "", "load_resources"], [9, 1, 1, "", "partial_match"], [9, 1, 1, "", "perfect_match"], [9, 1, 1, "", "run"], [9, 1, 1, "", "train"]], "t_res.geoparser.recogniser": [[10, 0, 1, "", "Recogniser"]], "t_res.geoparser.recogniser.Recogniser": [[10, 1, 1, "", "create_pipeline"], [10, 1, 1, "", "ner_predict"], [10, 1, 1, "", "train"]], "t_res.utils.REL.entity_disambiguation": [[19, 0, 1, "", "EntityDisambiguation"], [19, 2, 1, "", "RANDOM_SEED"]], "t_res.utils.REL.entity_disambiguation.EntityDisambiguation": [[19, 1, 1, "", "get_data_items"], [19, 1, 1, "", "normalize_scores"], [19, 1, 1, "", "predict"], [19, 1, 1, "", "prerank"], [19, 1, 1, "", "train"], [19, 1, 1, "", "train_LR"]], "t_res.utils.REL.mulrel_ranker": [[21, 0, 1, "", "MulRelRanker"], [21, 0, 1, "", "PreRank"]], 
"t_res.utils.REL.mulrel_ranker.MulRelRanker": [[21, 1, 1, "", "forward"], [21, 1, 1, "", "loss"], [21, 1, 1, "", "regularize"], [21, 2, 1, "", "training"]], "t_res.utils.REL.mulrel_ranker.PreRank": [[21, 1, 1, "", "forward"], [21, 2, 1, "", "training"]], "t_res.utils.REL.vocabulary": [[23, 0, 1, "", "Vocabulary"]], "t_res.utils.REL.vocabulary.Vocabulary": [[23, 1, 1, "", "add_to_vocab"], [23, 1, 1, "", "get_id"], [23, 1, 1, "", "normalize"], [23, 1, 1, "", "size"], [23, 2, 1, "", "unk_token"]], "t_res.utils.deezy_processing": [[12, 3, 1, "", "create_training_set"], [12, 3, 1, "", "generate_candidates"], [12, 3, 1, "", "obtain_matches"], [12, 3, 1, "", "train_deezy_model"]], "t_res.utils.get_data": [[13, 3, 1, "", "download_hipe_data"], [13, 3, 1, "", "download_lwm_data"]], "t_res.utils.ner": [[15, 3, 1, "", "aggregate_entities"], [15, 3, 1, "", "aggregate_mentions"], [15, 3, 1, "", "collect_named_entities"], [15, 3, 1, "", "fix_capitalization"], [15, 3, 1, "", "fix_hyphens"], [15, 3, 1, "", "fix_nested"], [15, 3, 1, "", "fix_startEntity"], [15, 3, 1, "", "training_tokenize_and_align_labels"]], "t_res.utils": [[16, 4, 0, "-", "preprocess_data"]], "t_res.utils.preprocess_data": [[16, 3, 1, "", "aggregate_hipe_entities"], [16, 3, 1, "", "fine_to_coarse"], [16, 3, 1, "", "process_hipe_for_linking"], [16, 3, 1, "", "process_lwm_for_linking"], [16, 3, 1, "", "process_lwm_for_ner"], [16, 3, 1, "", "process_tsv"], [16, 3, 1, "", "reconstruct_sentences"], [16, 3, 1, "", "turn_wikipedia2wikidata"]], "t_res.utils.process_data": [[17, 3, 1, "", "align_gold"], [17, 3, 1, "", "eval_with_exception"], [17, 3, 1, "", "ner_and_process"], [17, 3, 1, "", "postprocess_predictions"], [17, 3, 1, "", "prepare_sents"], [17, 3, 1, "", "prepare_storing_links"], [17, 3, 1, "", "store_for_scorer"], [17, 3, 1, "", "update_with_linking"], [17, 3, 1, "", "update_with_skyline"]], "t_res.utils.process_wikipedia": [[18, 3, 1, "", "make_wikilinks_consistent"], [18, 3, 1, "", 
"make_wikipedia2wikidata_consisent"], [18, 3, 1, "", "title_to_id"]], "t_res.utils.rel_e2e": [[24, 3, 1, "", "get_rel_from_api"], [24, 3, 1, "", "match_ent"], [24, 3, 1, "", "match_wikipedia_to_wikidata"], [24, 3, 1, "", "postprocess_rel"], [24, 3, 1, "", "rel_end_to_end"], [24, 3, 1, "", "run_rel_experiments"], [24, 3, 1, "", "store_rel"]], "t_res.utils.rel_utils": [[25, 3, 1, "", "add_publication"], [25, 3, 1, "", "eval_with_exception"], [25, 3, 1, "", "get_db_emb"], [25, 3, 1, "", "prepare_initial_data"], [25, 3, 1, "", "prepare_rel_trainset"], [25, 3, 1, "", "rank_candidates"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:attribute", "3": "py:function", "4": "py:module"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"], "4": ["py", "module", "Python module"]}, "titleterms": {"experi": 0, "evalu": 0, "1": [0, 1, 4, 27], "obtain": 0, "extern": 0, "resourc": [0, 1, 4], "2": [0, 1, 4, 27], "prepar": 0, "data": [0, 4], "3": [0, 1, 27], "run": 0, "4": [0, 1, 27], "The": 1, "complet": 1, "tour": 1, "pipelin": [1, 5, 8], "instanti": 1, "us": [1, 3, 28], "end": 1, "step": 1, "descript": 1, "output": 1, "recommend": 1, "recognis": [1, 10], "train": [1, 4], "ner": [1, 15], "model": 1, "ranker": [1, 9], "perfectmatch": 1, "partialmatch": 1, "levenshtein": 1, "deezymatch": [1, 4], "option": 1, "from": 1, "scratch": 1, "given": 1, "an": 1, "exist": 1, "string": 1, "pair": 1, "dataset": [1, 4], "includ": 1, "gener": 1, "load": 1, "retriev": 1, "candid": 1, "mention": 1, "linker": [1, 7], "mostpopular": 1, "reldisamb": 1, "entiti": [1, 4], "disambigu": [1, 4], "get": 2, "start": 2, "tabl": [2, 5, 6, 11, 14, 20, 26], "content": [2, 5, 6, 11, 14, 20, 26], "instal": 3, "t": [3, 5, 26, 27, 28], "re": [3, 5, 26, 27, 28], "updat": 3, "system": 3, "pyenv": 3, "poetri": 3, "project": 3, "how": 3, "pre": 3, "commit": 3, "hoook": 3, 
"directori": 4, "structur": 4, "toponym": [4, 5], "recognit": 4, "wikipedia": 4, "wikidata": 4, "base": 4, "mentions_to_wikidata": 4, "json": 4, "mentions_to_wikidata_norm": 4, "wikidata_to_mentions_norm": 4, "wikidata_gazett": 4, "csv": 4, "entity2class": 4, "txt": 4, "word": 4, "embed": 4, "set": 4, "word2vec": 4, "noisi": 4, "summari": 4, "A": 5, "resolut": 5, "digitis": 5, "histor": 5, "newspap": 5, "indic": 5, "geopars": [6, 7, 8, 9, 10], "modul": [6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], "t_re": [7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25], "link": 7, "rank": 9, "refer": 11, "util": [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], "deezy_process": 12, "get_data": 13, "preprocess_data": 16, "process_data": 17, "process_wikipedia": 18, "rel": [19, 20, 21, 22, 23], "entity_disambigu": 19, "mulrel_rank": 21, "vocabulari": 23, "rel_e2": 24, "rel_util": 25, "deploi": [26, 27], "api": [26, 27, 28], "build": 27, "contain": 27, "multipl": 27, "via": 27, "docker": 27, "compos": 27, "configur": 27, "your": 27, "deploy": 27}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx": 57}, "alltitles": {"Experiments and evaluation": [[0, "experiments-and-evaluation"]], "1. Obtain the external resources": [[0, "obtain-the-external-resources"]], "2. Preparing the data": [[0, "preparing-the-data"]], "3. Running the experiments": [[0, "running-the-experiments"]], "4. Evaluate": [[0, "evaluate"]], "The complete tour": [[1, "the-complete-tour"]], "The Pipeline": [[1, "the-pipeline"]], "1. Instantiate the Pipeline": [[1, "instantiate-the-pipeline"]], "2. 
Use the Pipeline": [[1, "use-the-pipeline"]], "End-to-end pipeline": [[1, "end-to-end-pipeline"]], "Step-by-step pipeline": [[1, "step-by-step-pipeline"]], "Description of the output": [[1, "description-of-the-output"]], "Pipeline recommendations": [[1, "pipeline-recommendations"]], "The Recogniser": [[1, "the-recogniser"]], "1. Instantiate the Recogniser": [[1, "instantiate-the-recogniser"]], "2. Train the NER model": [[1, "train-the-ner-model"]], "The Ranker": [[1, "the-ranker"]], "1. Instantiate the Ranker": [[1, "instantiate-the-ranker"]], "1.1. Perfectmatch, partialmatch, and levenshtein": [[1, "perfectmatch-partialmatch-and-levenshtein"]], "1.2. DeezyMatch": [[1, "id4"]], "Option 1. Train a DeezyMatch model from scratch, given an existing string pairs dataset": [[1, "option-1-train-a-deezymatch-model-from-scratch-given-an-existing-string-pairs-dataset"]], "Option 2. Train a DeezyMatch model from scratch, including generating a string pairs dataset": [[1, "option-2-train-a-deezymatch-model-from-scratch-including-generating-a-string-pairs-dataset"]], "2. Load the resources": [[1, "load-the-resources"], [1, "id9"]], "3. Train a DeezyMatch model": [[1, "train-a-deezymatch-model"]], "4. Retrieve candidates for a given mention": [[1, "retrieve-candidates-for-a-given-mention"]], "The Linker": [[1, "the-linker"]], "1. Instantiate the Linker": [[1, "instantiate-the-linker"]], "1.1. mostpopular": [[1, "mostpopular"]], "1.2. reldisamb": [[1, "reldisamb"]], "3. 
Train an entity disambiguation model": [[1, "train-an-entity-disambiguation-model"]], "Getting started": [[2, "getting-started"]], "Table of contents:": [[2, null], [5, null], [6, null], [11, null], [14, null], [20, null], [26, null]], "Installing T-Res": [[3, "installing-t-res"]], "Update the system": [[3, "update-the-system"]], "Install pyenv": [[3, "install-pyenv"]], "Install poetry": [[3, "install-poetry"]], "Project Installation": [[3, "project-installation"]], "How to use poetry": [[3, "how-to-use-poetry"]], "Pre-commit hoooks": [[3, "pre-commit-hoooks"]], "Resources and directory structure": [[4, "resources-and-directory-structure"]], "Toponym recognition and disambiguation training data": [[4, "toponym-recognition-and-disambiguation-training-data"]], "1. Toponym recognition dataset": [[4, "toponym-recognition-dataset"]], "2. Toponym disambiguation dataset": [[4, "toponym-disambiguation-dataset"]], "Wikipedia- and Wikidata-based resources": [[4, "wikipedia-and-wikidata-based-resources"]], "mentions_to_wikidata.json": [[4, "mentions-to-wikidata-json"]], "mentions_to_wikidata_normalized.json": [[4, "mentions-to-wikidata-normalized-json"]], "wikidata_to_mentions_normalized.json": [[4, "wikidata-to-mentions-normalized-json"]], "wikidata_gazetteer.csv": [[4, "wikidata-gazetteer-csv"]], "entity2class.txt": [[4, "entity2class-txt"]], "Entity and word embeddings": [[4, "entity-and-word-embeddings"]], "DeezyMatch training set": [[4, "deezymatch-training-set"]], "1. DeezyMatch training set": [[4, "id6"]], "2. 
Word2Vec embeddings trained on noisy data": [[4, "word2vec-embeddings-trained-on-noisy-data"]], "Summary of resources and directory structure": [[4, "summary-of-resources-and-directory-structure"]], "T-Res: A Toponym Resolution Pipeline for Digitised Historical Newspapers": [[5, "t-res-a-toponym-resolution-pipeline-for-digitised-historical-newspapers"]], "Indices and tables": [[5, "indices-and-tables"]], "geoparser module": [[6, "geoparser-module"]], "t_res.geoparser.linking.Linker": [[7, "t-res-geoparser-linking-linker"]], "t_res.geoparser.pipeline.Pipeline": [[8, "t-res-geoparser-pipeline-pipeline"]], "t_res.geoparser.ranking. Ranker": [[9, "t-res-geoparser-ranking-ranker"]], "t_res.geoparser.recogniser.Recogniser": [[10, "t-res-geoparser-recogniser-recogniser"]], "Reference": [[11, "reference"]], "t_res.utils.deezy_processing module": [[12, "t-res-utils-deezy-processing-module"]], "t_res.utils.get_data module": [[13, "t-res-utils-get-data-module"]], "utils module": [[14, "utils-module"]], "t_res.utils.ner module": [[15, "t-res-utils-ner-module"]], "t_res.utils.preprocess_data module": [[16, "t-res-utils-preprocess-data-module"]], "t_res.utils.process_data module": [[17, "t-res-utils-process-data-module"]], "t_res.utils.process_wikipedia module": [[18, "t-res-utils-process-wikipedia-module"]], "t_res.utils.REL.entity_disambiguation module": [[19, "t-res-utils-rel-entity-disambiguation-module"]], "utils.REL module": [[20, "utils-rel-module"]], "t_res.utils.REL.mulrel_ranker module": [[21, "t-res-utils-rel-mulrel-ranker-module"]], "t_res.utils.REL.t_res.utils module": [[22, "t-res-utils-rel-t-res-utils-module"]], "t_res.utils.REL.vocabulary module": [[23, "t-res-utils-rel-vocabulary-module"]], "t_res.utils.rel_e2e module": [[24, "t-res-utils-rel-e2e-module"]], "t_res.utils.rel_utils module": [[25, "t-res-utils-rel-utils-module"]], "Deploying the T-Res API": [[26, "deploying-the-t-res-api"], [27, "deploying-the-t-res-api"]], "1. 
Building the container": [[27, "building-the-container"]], "2. Deploying the container": [[27, "deploying-the-container"]], "3. Deploying multiple containers via Docker Compose": [[27, "deploying-multiple-containers-via-docker-compose"]], "4. Configuring your deployment": [[27, "configuring-your-deployment"]], "Using the T-Res API": [[28, "using-the-t-res-api"]]}, "indexentries": {"linker (class in t_res.geoparser.linking)": [[7, "t_res.geoparser.linking.Linker"]], "random_seed (t_res.geoparser.linking attribute)": [[7, "t_res.geoparser.linking.RANDOM_SEED"]], "by_distance() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.by_distance"]], "load_resources() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.load_resources"]], "most_popular() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.most_popular"]], "run() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.run"]], "train_load_model() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.train_load_model"]], "pipeline (class in t_res.geoparser.pipeline)": [[8, "t_res.geoparser.pipeline.Pipeline"]], "format_prediction() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.format_prediction"]], "run_candidate_selection() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_candidate_selection"]], "run_disambiguation() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_disambiguation"]], "run_sentence() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_sentence"]], "run_sentence_recognition() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_sentence_recognition"]], "run_text() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_text"]], 
"run_text_recognition() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_text_recognition"]], "ranker (class in t_res.geoparser.ranking)": [[9, "t_res.geoparser.ranking.Ranker"]], "check_if_contained() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.check_if_contained"]], "damlev_dist() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.damlev_dist"]], "deezy_on_the_fly() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.deezy_on_the_fly"]], "find_candidates() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.find_candidates"]], "load_resources() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.load_resources"]], "partial_match() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.partial_match"]], "perfect_match() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.perfect_match"]], "run() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.run"]], "train() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.train"]], "recogniser (class in t_res.geoparser.recogniser)": [[10, "t_res.geoparser.recogniser.Recogniser"]], "create_pipeline() (t_res.geoparser.recogniser.recogniser method)": [[10, "t_res.geoparser.recogniser.Recogniser.create_pipeline"]], "ner_predict() (t_res.geoparser.recogniser.recogniser method)": [[10, "t_res.geoparser.recogniser.Recogniser.ner_predict"]], "train() (t_res.geoparser.recogniser.recogniser method)": [[10, "t_res.geoparser.recogniser.Recogniser.train"]], "create_training_set() (in module t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.create_training_set"]], "generate_candidates() (in module t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.generate_candidates"]], "obtain_matches() (in module 
t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.obtain_matches"]], "train_deezy_model() (in module t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.train_deezy_model"]], "download_hipe_data() (in module t_res.utils.get_data)": [[13, "t_res.utils.get_data.download_hipe_data"]], "download_lwm_data() (in module t_res.utils.get_data)": [[13, "t_res.utils.get_data.download_lwm_data"]], "aggregate_entities() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.aggregate_entities"]], "aggregate_mentions() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.aggregate_mentions"]], "collect_named_entities() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.collect_named_entities"]], "fix_capitalization() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_capitalization"]], "fix_hyphens() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_hyphens"]], "fix_nested() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_nested"]], "fix_startentity() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_startEntity"]], "training_tokenize_and_align_labels() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.training_tokenize_and_align_labels"]], "aggregate_hipe_entities() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.aggregate_hipe_entities"]], "fine_to_coarse() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.fine_to_coarse"]], "module": [[16, "module-t_res.utils.preprocess_data"]], "process_hipe_for_linking() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.process_hipe_for_linking"]], "process_lwm_for_linking() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.process_lwm_for_linking"]], "process_lwm_for_ner() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.process_lwm_for_ner"]], "process_tsv() (in module t_res.utils.preprocess_data)": [[16, 
"t_res.utils.preprocess_data.process_tsv"]], "reconstruct_sentences() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.reconstruct_sentences"]], "t_res.utils.preprocess_data": [[16, "module-t_res.utils.preprocess_data"]], "turn_wikipedia2wikidata() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.turn_wikipedia2wikidata"]], "align_gold() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.align_gold"]], "eval_with_exception() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.eval_with_exception"]], "ner_and_process() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.ner_and_process"]], "postprocess_predictions() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.postprocess_predictions"]], "prepare_sents() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.prepare_sents"]], "prepare_storing_links() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.prepare_storing_links"]], "store_for_scorer() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.store_for_scorer"]], "update_with_linking() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.update_with_linking"]], "update_with_skyline() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.update_with_skyline"]], "make_wikilinks_consistent() (in module t_res.utils.process_wikipedia)": [[18, "t_res.utils.process_wikipedia.make_wikilinks_consistent"]], "make_wikipedia2wikidata_consisent() (in module t_res.utils.process_wikipedia)": [[18, "t_res.utils.process_wikipedia.make_wikipedia2wikidata_consisent"]], "title_to_id() (in module t_res.utils.process_wikipedia)": [[18, "t_res.utils.process_wikipedia.title_to_id"]], "entitydisambiguation (class in t_res.utils.rel.entity_disambiguation)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation"]], "random_seed 
(t_res.utils.rel.entity_disambiguation attribute)": [[19, "t_res.utils.REL.entity_disambiguation.RANDOM_SEED"]], "get_data_items() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.get_data_items"]], "normalize_scores() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.normalize_scores"]], "predict() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.predict"]], "prerank() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.prerank"]], "train() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.train"]], "train_lr() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.train_LR"]], "mulrelranker (class in t_res.utils.rel.mulrel_ranker)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker"]], "prerank (class in t_res.utils.rel.mulrel_ranker)": [[21, "t_res.utils.REL.mulrel_ranker.PreRank"]], "forward() (t_res.utils.rel.mulrel_ranker.mulrelranker method)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.forward"]], "forward() (t_res.utils.rel.mulrel_ranker.prerank method)": [[21, "t_res.utils.REL.mulrel_ranker.PreRank.forward"]], "loss() (t_res.utils.rel.mulrel_ranker.mulrelranker method)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.loss"]], "regularize() (t_res.utils.rel.mulrel_ranker.mulrelranker method)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.regularize"]], "training (t_res.utils.rel.mulrel_ranker.mulrelranker attribute)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.training"]], "training (t_res.utils.rel.mulrel_ranker.prerank attribute)": 
[[21, "t_res.utils.REL.mulrel_ranker.PreRank.training"]], "vocabulary (class in t_res.utils.rel.vocabulary)": [[23, "t_res.utils.REL.vocabulary.Vocabulary"]], "add_to_vocab() (t_res.utils.rel.vocabulary.vocabulary method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.add_to_vocab"]], "get_id() (t_res.utils.rel.vocabulary.vocabulary method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.get_id"]], "normalize() (t_res.utils.rel.vocabulary.vocabulary static method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.normalize"]], "size() (t_res.utils.rel.vocabulary.vocabulary method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.size"]], "unk_token (t_res.utils.rel.vocabulary.vocabulary attribute)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.unk_token"]], "get_rel_from_api() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.get_rel_from_api"]], "match_ent() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.match_ent"]], "match_wikipedia_to_wikidata() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.match_wikipedia_to_wikidata"]], "postprocess_rel() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.postprocess_rel"]], "rel_end_to_end() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.rel_end_to_end"]], "run_rel_experiments() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.run_rel_experiments"]], "store_rel() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.store_rel"]], "add_publication() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.add_publication"]], "eval_with_exception() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.eval_with_exception"]], "get_db_emb() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.get_db_emb"]], "prepare_initial_data() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.prepare_initial_data"]], "prepare_rel_trainset() (in module t_res.utils.rel_utils)": [[25, 
"t_res.utils.rel_utils.prepare_rel_trainset"]], "rank_candidates() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.rank_candidates"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["experiments/index", "getting-started/complete-tour", "getting-started/index", "getting-started/installation", "getting-started/resources", "index", "reference/geoparser/index", "reference/geoparser/linker", "reference/geoparser/pipeline", "reference/geoparser/ranker", "reference/geoparser/recogniser", "reference/index", "reference/utils/deezy_processing", "reference/utils/get_data", "reference/utils/index", "reference/utils/ner", "reference/utils/preprocess_data", "reference/utils/process_data", "reference/utils/process_wikipedia", "reference/utils/rel/entity_disambiguation", "reference/utils/rel/index", "reference/utils/rel/mulrel_ranker", "reference/utils/rel/utils", "reference/utils/rel/vocabulary", "reference/utils/rel_e2e", "reference/utils/rel_utils", "t-res-api/index", "t-res-api/installation", "t-res-api/usage"], "filenames": ["experiments/index.rst", "getting-started/complete-tour.rst", "getting-started/index.rst", "getting-started/installation.rst", "getting-started/resources.rst", "index.rst", "reference/geoparser/index.rst", "reference/geoparser/linker.rst", "reference/geoparser/pipeline.rst", "reference/geoparser/ranker.rst", "reference/geoparser/recogniser.rst", "reference/index.rst", "reference/utils/deezy_processing.rst", "reference/utils/get_data.rst", "reference/utils/index.rst", "reference/utils/ner.rst", "reference/utils/preprocess_data.rst", "reference/utils/process_data.rst", "reference/utils/process_wikipedia.rst", "reference/utils/rel/entity_disambiguation.rst", "reference/utils/rel/index.rst", "reference/utils/rel/mulrel_ranker.rst", "reference/utils/rel/utils.rst", "reference/utils/rel/vocabulary.rst", "reference/utils/rel_e2e.rst", "reference/utils/rel_utils.rst", "t-res-api/index.rst", "t-res-api/installation.rst", "t-res-api/usage.rst"], "titles": ["Experiments and evaluation", "The complete tour", "Getting started", "Installing T-Res", "Resources and directory structure", "T-Res: A Toponym Resolution 
Pipeline for Digitised Historical Newspapers", "<code class=\"docutils literal notranslate\"><span class=\"pre\">geoparser</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.linking.Linker</span></code>", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.pipeline.Pipeline</span></code>", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.ranking.</span> <span class=\"pre\">Ranker</span></code>", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.geoparser.recogniser.Recogniser</span></code>", "Reference", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.deezy_processing</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.get_data</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">utils</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.ner</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.preprocess_data</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.process_data</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.process_wikipedia</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.entity_disambiguation</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">utils.REL</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.mulrel_ranker</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.t_res.utils</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.REL.vocabulary</span></code> 
module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.rel_e2e</span></code> module", "<code class=\"docutils literal notranslate\"><span class=\"pre\">t_res.utils.rel_utils</span></code> module", "Deploying the T-Res API", "Deploying the T-Res API", "Using the T-Res API"], "terms": {"follow": [0, 1, 3, 4, 8, 9, 15, 16, 17, 18, 26, 27, 28], "step": [0, 4, 8, 17, 19, 27, 28], "reproduc": 0, "our": [0, 1, 3, 4, 16], "paper": [0, 1, 21], "instruct": [0, 3, 4, 8], "directori": [0, 1, 2, 5, 7, 8, 9, 12, 16, 27], "structur": [0, 1, 2, 5, 7, 16, 27], "page": [0, 1, 4, 5, 16, 18, 21], "document": [0, 1, 4, 5, 8, 9, 16, 28], "requir": [0, 1, 4, 7, 8, 9, 12, 17, 19, 24, 25, 27], "To": [0, 1, 3, 4, 27], "creat": [0, 1, 3, 4, 7, 9, 10, 12, 16, 17, 28], "dataset": [0, 8, 9, 10, 12, 13, 15, 19, 24, 25], "we": [0, 1, 3, 4, 5, 15, 17, 19, 25], "us": [0, 2, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 21, 23, 24, 25, 26, 27], "present": [0, 18, 19], "command": [0, 3, 27], "from": [0, 4, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 27, 28], "folder": [0, 1, 4, 12, 17], "python": [0, 3, 4, 18, 28], "prepare_data": [0, 25], "py": [0, 3, 25, 26, 27], "p": [0, 1, 7, 19, 20, 21, 23, 27], "thi": [0, 1, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27], "script": [0, 3, 4, 16, 20, 23, 25], "take": [0, 1, 8, 9, 10, 15, 16, 25], "care": [0, 1], "download": [0, 1, 4, 13], "lwm": [0, 1, 4, 7, 13, 16, 17], "hipe": [0, 13, 16, 17, 24], "format": [0, 1, 4, 8, 15, 16, 17, 19, 24, 25], "them": [0, 1, 3, 4, 5, 8, 9, 12, 15, 16, 19, 27], "need": [0, 1, 3, 4, 7, 9, 16, 25, 26], "toponym_resolut": [0, 3, 28], "doe": [0, 1, 12, 19], "all": [0, 1, 3, 4, 7, 8, 15, 16, 17], "differ": [0, 1, 4, 7, 9, 15, 25], "scenario": 0, "report": 0, "approach": [0, 1, 4, 7, 9, 21, 24, 25], "tabl": [0, 4], "result": [0, 1, 4, 7, 8, 9, 12, 15, 17, 24], "one": [0, 1, 4, 8, 15, 16, 17], "provid": [0, 1, 4, 5, 7, 8, 9, 10, 12, 15, 17, 25, 26, 27], "go": 0, 
"There": [0, 15], "you": [0, 1, 3, 4, 8, 27, 28], "should": [0, 1, 4, 7, 15, 19, 23, 27, 28], "clone": [0, 3], "scorer": [0, 17, 24], "ar": [0, 1, 3, 4, 5, 8, 9, 10, 12, 15, 16, 17, 19, 20, 21, 25, 26, 27], "code": [0, 1, 4, 5, 16, 21, 23], "version": [0, 3, 4, 9], "commit": [0, 2, 5], "50dff4e": 0, "have": [0, 1, 3, 4, 17, 19, 20], "ad": [0, 1, 8, 17, 23, 25], "line": [0, 1, 4], "return": [0, 1, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 21, 23, 24, 25], "eval_stat": 0, "end": [0, 4, 5, 8, 10, 15, 16, 17, 24], "get_result": 0, "function": [0, 7, 8, 10, 12, 15, 16, 17, 18, 19, 21, 24, 25], "latex": 0, "display_result": 0, "t": [1, 2, 4, 15, 17], "re": [1, 2, 4], "ha": [1, 4, 5, 8, 9, 17], "three": [1, 4, 5, 8, 15, 17, 24], "main": [1, 5], "class": [1, 4, 5, 7, 8, 9, 10, 16, 19, 21, 23], "which": [1, 3, 4, 7, 8, 9, 15, 17, 19, 23, 24, 26, 27], "perform": [1, 4, 5, 7, 8, 9, 10, 15, 16, 17, 18, 19, 24, 25], "toponym": [1, 2, 8, 10, 16, 28], "recognit": [1, 2, 5, 8, 10, 15, 16, 17], "i": [1, 3, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 21, 23, 24, 25, 27, 28], "name": [1, 3, 4, 5, 8, 9, 10, 15, 16, 17, 21, 25, 27], "task": [1, 4, 7, 8, 10, 19], "select": [1, 3, 4, 5, 7, 8, 9, 12], "rank": [1, 5, 6, 7, 8, 11, 17, 21, 25], "identifi": [1, 4, 5, 8, 9, 15, 16, 18, 24, 25], "most": [1, 4, 7, 19], "like": [1, 4, 8, 17, 19], "those": [1, 9, 15], "addit": [1, 10], "wrap": [1, 7], "compon": [1, 5], "therefor": [1, 4], "make": [1, 3, 4, 18], "easier": 1, "user": [1, 3, 4, 26], "link": [1, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 21, 24, 25], "In": [1, 3, 4], "section": [1, 4, 27, 28], "depth": 1, "each": [1, 4, 5, 8, 9, 10, 15, 16, 17, 19, 24, 25], "four": [1, 4], "start": [1, 4, 5, 8, 10, 15, 16, 17, 24], "other": [1, 4], "refer": [1, 4, 5, 7, 9, 19, 20, 21, 23], "learn": [1, 4, 19], "more": [1, 4, 7, 9, 17, 23, 28], "about": [1, 4, 9, 17, 25], "also": [1, 3, 4, 5, 8, 9, 10, 15, 17, 26], "first": [1, 3, 4, 8, 9, 12, 15, 17], "try": [1, 8], "run": [1, 3, 4, 5, 7, 8, 9, 10, 24, 27], 
"default": [1, 4, 7, 8, 9, 10, 12, 17, 19, 23, 25, 27, 28], "chang": [1, 3, 4, 15], "accordingli": [1, 9], "your": [1, 3, 4, 5, 8, 16, 24, 26, 28], "note": [1, 4, 7, 8, 9], "befor": [1, 4, 25, 27], "being": 1, "abl": 1, "sure": [1, 3, 4, 28], "object": [1, 4, 7, 8, 9, 10, 12, 15, 17, 23, 24, 25], "By": [1, 4, 8], "huggingfac": [1, 4, 10, 15], "do": [1, 3, 4], "geopars": [1, 4, 5, 11, 17, 25, 27], "import": [1, 4, 27], "resources_path": [1, 7, 8, 9], "updat": [1, 2, 5, 9, 16, 17, 27], "path": [1, 3, 4, 7, 8, 9, 10, 12, 13, 16, 17, 18, 24, 27], "argument": [1, 10, 12], "reflect": 1, "set": [1, 2, 5, 7, 8, 9, 10, 12, 15, 16, 17, 19, 21, 24, 25, 27], "up": [1, 3, 4, 8, 12, 27], "can": [1, 3, 4, 8, 15, 25, 26, 27], "customis": 1, "see": [1, 4, 7, 8, 9, 15, 17, 19, 20, 21, 23, 28], "order": [1, 3, 4, 7, 16, 17], "just": 1, "beforehand": 1, "pass": [1, 8, 9, 12, 16, 21, 25], "myner": [1, 8, 17], "myrank": [1, 7, 8, 9, 12, 25], "mylink": [1, 7, 8, 25], "expect": [1, 4, 16], "experi": [1, 3, 4, 5, 7, 8, 16, 17, 24, 25, 27], "exampl": [1, 4, 7, 8, 9, 10, 15, 16, 17, 18, 26, 27, 28], "ani": [1, 8, 10, 17, 18, 25], "same": [1, 4, 15, 16, 17], "level": [1, 8, 16], "look": [1, 17], "right": 1, "locat": [1, 4, 7, 12, 16], "If": [1, 3, 4, 7, 8, 9, 10, 12, 15, 16, 17, 18, 25, 28], "itself": [1, 9], "time": [1, 4, 8, 19], "certain": [1, 7], "input": [1, 5, 8, 10, 12, 15, 24, 25], "paramet": [1, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 21, 23, 24, 25], "long": [1, 21], "readi": 1, "predict": [1, 4, 7, 8, 9, 10, 15, 17, 19, 24], "so": 1, "onc": 1, "been": [1, 3, 4, 5, 8, 9, 17, 20], "text": [1, 4, 5, 7, 8, 9, 10, 15, 16, 17, 18, 19, 24, 28], "individu": 1, "list": [1, 4, 8, 9, 10, 12, 15, 16, 17, 19, 24, 25], "two": [1, 4, 8, 9, 12, 15, 16, 17], "run_sent": [1, 8], "method": [1, 4, 7, 8, 9, 10, 17, 19, 21, 23], "appli": [1, 4, 7, 19], "run_text": [1, 8], "split": [1, 4, 7, 8, 9, 15, 17, 24, 25], "sentenc": [1, 4, 8, 9, 10, 15, 16, 17, 24, 28], "inspector": 1, "liddl": 1, "said": 1, "am": 
1, "polic": 1, "live": [1, 3, 10], "citi": [1, 4, 8, 16, 18], "durham": 1, "both": [1, 4, 15, 19, 25], "case": [1, 4, 15, 17, 25], "todo": 1, "docstr": 1, "place": [1, 4, 5, 7, 8, 16, 17, 28], "public": [1, 4, 7, 8, 16, 17, 25], "associ": [1, 7, 8, 16, 21, 23], "human": [1, 8], "legibl": [1, 8], "e": [1, 3, 4, 7, 8, 9, 12, 15, 16, 17, 18, 19, 24, 25, 27], "g": [1, 4, 7, 8, 9, 15, 16, 17, 18, 21, 24, 25], "london": [1, 4, 8, 9, 10, 28], "place_wqid": [1, 4, 8, 17, 28], "wikidata": [1, 2, 5, 7, 8, 9, 12, 16, 17, 18, 24, 25], "id": [1, 4, 5, 7, 8, 9, 12, 15, 16, 17, 18, 23, 24, 25], "q84": [1, 4, 7, 8, 9], "For": [1, 4, 7, 8, 9, 10, 15, 17], "alston": 1, "cumbria": 1, "england": [1, 4], "q2560190": 1, "ner_scor": [1, 8, 15, 17], "0": [1, 4, 8, 9, 10, 15, 16, 17, 19, 23, 27, 28], "999": 1, "po": [1, 8], "74": 1, "sent_idx": [1, 8], "end_po": [1, 8], "80": [1, 27], "tag": [1, 8, 15, 16, 17], "loc": [1, 4, 15, 16, 17], "q179815": 1, "ed_scor": [1, 8], "039": 1, "cross_cand_scor": [1, 8], "396": 1, "q23082": 1, "327": 1, "q49229": 1, "141": 1, "q5316459": 1, "049": 1, "q458393": 1, "045": 1, "q17003433": 1, "042": 1, "q1075483": 1, "string_match_scor": [1, 8], "q1137286": 1, "q5316477": 1, "q752266": 1, "prior_cand_scor": [1, 8], "881": 1, "522": 1, "457": 1, "455": 1, "313": 1, "295": 1, "293": 1, "latlon": [1, 8], "54": 1, "783333": 1, "566667": 1, "wkdt_class": [1, 8], "q515": [1, 4], "how": [1, 2, 4, 5, 8, 28], "run_text_recognit": [1, 8], "context": [1, 7, 8, 19, 21], "gold": [1, 8, 15, 17, 21, 24, 25], "none": [1, 4, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 21, 23, 24, 25], "ngram": [1, 8], "conf_md": [1, 8], "previou": [1, 8, 15, 16, 24, 28], "ner_output": 1, "cand": 1, "run_candidate_select": [1, 8], "score": [1, 4, 7, 8, 9, 10, 15, 16, 17, 19, 21], "022222222222222223": 1, "3157894736842105": 1, "013513513513513514": 1, "06484443152079093": 1, "final": [1, 4, 8], "disamb_output": 1, "run_disambigu": [1, 8], "exact": [1, 9], "wise": 1, "manner": 1, "regardless": 1, 
"field": [1, 4, 15, 25], "confid": [1, 7, 8, 10, 19], "posit": [1, 4, 8, 10, 12, 15, 16, 17, 24], "index": [1, 4, 5, 8, 15], "label": [1, 4, 8, 10, 15, 16, 17, 27], "qid": [1, 4, 24], "nil": [1, 7, 24], "A": [1, 4, 7, 8, 9, 10, 12, 15, 16, 17, 19, 23, 24, 25], "dictionari": [1, 4, 7, 8, 9, 10, 12, 15, 16, 17, 24, 25, 27], "match": [1, 4, 8, 9, 12, 16, 24], "prior": [1, 8], "cross": [1, 8], "latitud": [1, 4, 8], "longitud": [1, 4, 8], "coordin": [1, 4, 5, 8], "get": [1, 5, 19, 23, 28], "its": [1, 4, 8, 9, 12, 16, 17, 19, 21, 23], "significantli": 1, "less": 1, "complex": 1, "than": [1, 19], "better": 1, "mai": [1, 15], "bad": 1, "plan": 1, "modern": 1, "global": [1, 3], "clean": 1, "data": [1, 2, 5, 7, 9, 12, 16, 17, 18, 19, 25], "howev": 1, "account": [1, 16], "agnost": 1, "often": 1, "quantitav": 1, "quit": 1, "well": [1, 4], "becaus": [1, 15, 17, 19], "higher": 1, "probabl": 1, "common": [1, 4, 5], "sens": 1, "appear": [1, 4], "consider": 1, "longer": 1, "want": [1, 3, 4], "few": [1, 3], "larg": 1, "number": [1, 4, 9, 16, 23], "done": 1, "effici": 1, "save": [1, 4, 10, 12, 24], "lot": 1, "obtain": [1, 4, 5, 10, 12], "uniqu": [1, 16], "full": [1, 8, 17, 24], "per": [1, 4, 16, 17, 19, 21, 24], "basi": [1, 17], "assum": [1, 4, 16], "csv": 1, "row": [1, 4, 9, 16, 17], "df": [1, 4, 17, 25], "pd": [1, 4, 9, 17], "read_pickl": 1, "1880": 1, "1900": 1, "hmd": 1, "subsampl": 1, "wikidata_id": [1, 4], "find": [1, 4, 8, 9, 24], "datafram": [1, 16, 17, 25], "nlp_df": 1, "identified_toponym": 1, "progress_appli": 1, "lambda": 1, "x": [1, 28], "axi": 1, "whole": 1, "all_toponym": 1, "item": 1, "l": [1, 16], "all_cand": 1, "back": [1, 4], "top": [1, 4, 12, 16], "geograph": [1, 4, 5, 7, 8], "": [1, 4, 7, 8, 9, 15, 27, 28], "transform": [1, 10, 15], "librari": [1, 3, 4], "either": [1, 4, 15, 27], "directli": [1, 3, 4], "hub": [1, 4, 10], "local": [1, 4, 10, 21, 26], "store": [1, 4, 9, 10, 12, 17, 23, 24, 25, 27], "fine": [1, 10, 16], "tune": [1, 10], "new": [1, 4, 8, 12, 15, 16, 
18, 25, 27], "base": [1, 2, 5, 7, 9, 10, 12, 15, 16, 17, 19, 21, 23, 24], "alreadi": [1, 4, 7, 8, 9, 10], "pre": [1, 2, 4, 5, 10], "notebook": [1, 3, 28], "detect": [1, 4, 15, 16, 17], "train_use_ner_model": 1, "ipynb": [1, 28], "load_use_ner_model": 1, "load_from_hub": [1, 8, 10], "true": [1, 4, 7, 8, 9, 10, 12, 15, 27], "livingwithmachin": [1, 8], "19thc": [1, 4, 8], "en": [1, 8, 16, 18], "initialis": [1, 7, 8, 9, 10], "wai": [1, 17], "let": 1, "suppos": 1, "rel": [1, 4, 5, 7, 9, 11, 14, 24, 25], "blb_lwm": 1, "could": [1, 4, 18], "notic": [1, 7, 19, 20, 21, 23], "still": 1, "would": [1, 18, 19], "load_from_path": 1, "altern": [1, 4, 9], "below": [1, 4, 7, 8, 9, 12], "train_dataset": [1, 10], "ner_fine_train": [1, 4], "json": [1, 10, 17, 25, 28], "test_dataset": [1, 10], "ner_fine_dev": [1, 4], "base_model": [1, 10], "bert_1760_1900": 1, "model_path": [1, 7, 10], "training_arg": [1, 10], "batch_siz": [1, 10], "8": [1, 10, 16], "num_train_epoch": [1, 10], "10": [1, 4, 10, 16], "learning_r": [1, 10], "00005": [1, 10], "weight_decai": [1, 10], "overwrite_train": [1, 7, 9, 10, 12], "fals": [1, 4, 7, 8, 9, 10, 18, 19, 23], "do_test": [1, 7, 9, 10], "indic": [1, 4, 7, 9, 16, 23], "whether": [1, 4, 7, 8, 9, 10, 17, 23], "prepar": [1, 5, 17, 25], "unless": [1, 17], "even": 1, "specifi": [1, 4, 7, 8, 9, 10, 12, 23, 25], "bert": [1, 4, 10, 17], "nineteenth": 1, "centuri": [1, 4], "test": [1, 3, 4, 7, 9, 10, 17, 25, 27], "necessari": [1, 4, 7, 21], "inform": [1, 4, 7, 9, 15, 16, 17, 19, 20, 21, 23, 24, 25], "where": [1, 4, 9, 12, 13, 15, 17, 19, 24], "rate": 1, "batch": [1, 21], "size": [1, 23], "epoch": 1, "weight": 1, "decai": 1, "allow": [1, 4, 26], "mock": 1, "suffix": [1, 17], "_test": 1, "load_to_hub": 1, "skip": [1, 3, 4, 7, 9, 10], "call": [1, 4, 9, 12, 15, 27], "taken": [1, 19, 20, 21, 23], "knowledg": [1, 7, 9, 19, 24], "accord": [1, 17, 27, 28], "similar": [1, 8, 9, 12, 17], "target": [1, 8], "subset": [1, 8], "next": [1, 4, 8], "gazett": [1, 16, 24], "combin": 
[1, 5, 15], "wikipedia": [1, 2, 5, 7, 9, 12, 16, 18, 24], "strategi": 1, "ident": 1, "wiltshir": [1, 4], "q23183": [1, 4], "q55448990": [1, 4], "q8023421": [1, 4], "anchor": [1, 4], "partial": [1, 9], "between": [1, 4, 9, 12, 19, 21], "queri": [1, 9, 25, 28], "overlap": [1, 9, 17], "ashton": [1, 4, 15], "under": [1, 4, 15], "lyne": [1, 4, 15], "fuzzi": [1, 4, 9], "distanc": [1, 7, 9], "wiltshrr": 1, "accur": 1, "when": [1, 4, 7, 8, 9, 10, 15, 17, 21], "come": [1, 4, 8], "ocr": [1, 4, 10, 12, 16], "variat": [1, 4, 9, 12], "veri": 1, "slow": 1, "embed": [1, 2, 5, 7, 12, 21, 23, 25], "It": [1, 7, 8, 9, 10, 12, 15, 19, 25], "hour": 1, "fastest": 1, "except": [1, 4], "respect": [1, 10, 17], "contain": [1, 4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 24, 25, 26], "describ": [1, 4, 5, 7], "trickier": 1, "ideal": 1, "captur": 1, "type": [1, 4, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 23, 24, 25, 28], "found": [1, 4, 8, 9, 18, 23, 24, 25], "errror": 1, "train_use_deezy_model_1": 1, "train_use_deezy_model_2": 1, "train_use_deezy_model_3": 1, "detail": [1, 7, 9, 15], "file": [1, 3, 4, 12, 16, 17, 24, 25, 26, 27], "w2v_ocr_pair": [1, 4, 12], "txt": [1, 12], "characters_v001": 1, "vocab": 1, "input_dfm": [1, 12], "yaml": [1, 12], "news_dataset": [1, 4], "mentions_to_wikidata_norm": 1, "wikidata_to_mentions_norm": 1, "pathlib": 1, "strvar_paramet": [1, 9, 12], "dict": [1, 7, 8, 9, 10, 12, 15, 16, 17, 24, 25], "deezy_paramet": [1, 9, 12], "filenam": 1, "dm_path": [1, 9, 12], "str": [1, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 23, 24, 25], "resolv": [1, 4, 9, 19, 28], "dm_cand": [1, 9], "wkdtalt": [1, 9], "dm_model": [1, 9], "w2v_ocr": [1, 9], "dm_output": [1, 9], "deezymatch_on_the_fli": [1, 9], "measur": [1, 4], "ranking_metr": [1, 9], "faiss": [1, 9], "selection_threshold": [1, 9], "50": [1, 9], "num_candid": [1, 9], "verbos": [1, 9], "readm": 1, "left": [1, 4], "empti": [1, 4, 7, 8, 9, 15, 16, 17, 25], "sinc": 1, "realli": 1, "metric": [1, 9, 10], "vector": [1, 4, 12, 21], "threshold": [1, 9, 
12], "maximum": [1, 17], "overwrit": [1, 7, 10], "mode": [1, 7, 9, 10], "w2v": [1, 4, 9], "w2v_1800s_new": 1, "syn1neg": [1, 4], "npy": [1, 4], "wv": [1, 4], "w2v_1860s_new": 1, "ocr_threshold": [1, 9], "60": [1, 9], "top_threshold": [1, 9], "85": [1, 9], "min_len": [1, 9], "5": [1, 4, 9], "max_len": [1, 9], "15": [1, 9, 16], "w2v_ocr_path": [1, 9], "w2v_ocr_model": [1, 9], "w2v_": [1, 9], "_new": [1, 9], "overwrite_dataset": [1, 9], "fuzzywuzzi": 1, "ratio": [1, 12, 19], "consid": [1, 12, 15], "neg": [1, 4, 12], "minimum": 1, "length": [1, 4], "word": [1, 2, 5, 10, 12, 15, 16, 17, 21, 23, 25], "word2vec": [1, 12], "regular": [1, 21], "express": [1, 17], "wikidata_norm": 1, "thei": [1, 4, 9, 17, 18], "mentions_to_wikidata": [1, 9, 12, 25], "load_resourc": [1, 7, 9], "model_state_dict": 1, "find_candid": [1, 8, 9], "kei": [1, 4, 7, 8, 9, 10, 15, 16, 17, 24, 25], "alwai": [1, 4, 27], "valu": [1, 4, 7, 9, 15, 16, 17, 24, 25], "question": [1, 4], "mancheft": 1, "print": [1, 8, 9, 10, 16], "best": 1, "depend": [1, 3, 9], "unsupervis": [1, 4, 7], "popular": [1, 7], "term": [1, 21], "inlink": 1, "implement": [1, 9, 21, 25], "ment": 1, "norm": 1, "algorithm": 1, "propos": 1, "le": [1, 21], "titov": [1, 21], "2018": [1, 21], "ganea": [1, 21], "hofmann": [1, 21], "2017": [1, 21], "adapt": [1, 4, 7, 10, 15, 18, 19, 20, 21, 26], "know": 1, "van": [1, 7, 19, 20, 21, 23], "hulst": [1, 7, 19, 20, 21, 23], "johann": [1, 7, 19, 20, 21, 23], "m": [1, 7, 19, 20, 21, 23], "faegheh": [1, 7, 19, 20, 21, 23], "hasibi": [1, 7, 19, 20, 21, 23], "koen": [1, 7, 19, 20, 21, 23], "dercksen": [1, 7, 19, 20, 21, 23], "krisztian": [1, 7, 19, 20, 21, 23], "balog": [1, 7, 19, 20, 21, 23], "arjen": [1, 7, 19, 20, 21, 23], "de": [1, 7, 19, 20, 21, 23], "vri": [1, 7, 19, 20, 21, 23], "stand": [1, 7, 19, 20, 21, 23], "shoulder": [1, 7, 19, 20, 21, 23], "giant": [1, 7, 19, 20, 21, 23], "proceed": [1, 7, 19, 20, 21, 23, 27], "43rd": [1, 7, 19, 20, 21, 23], "intern": [1, 7, 19, 20, 21, 23], "acm": [1, 7, 
19, 20, 21, 23], "sigir": [1, 7, 19, 20, 21, 23], "confer": [1, 7, 19, 20, 21, 23], "research": [1, 4, 7, 19, 20, 21, 23], "develop": [1, 4, 7, 19, 20, 21, 23], "pp": 1, "2197": 1, "2200": 1, "2020": [1, 7, 17, 19, 20, 21, 23], "phong": [1, 21], "ivan": [1, 21], "improv": [1, 4, 21], "latent": [1, 21], "relat": [1, 8, 9, 10, 21], "56th": [1, 21], "annual": [1, 21], "meet": [1, 21], "comput": [1, 9, 10, 19, 21], "linguist": [1, 21], "volum": [1, 21, 27], "1595": [1, 21], "1604": [1, 21], "octavian": [1, 21], "eugen": [1, 21], "thoma": [1, 21, 28], "deep": [1, 9, 19, 21], "joint": [1, 21], "neural": [1, 9, 21], "attent": [1, 21], "empir": [1, 21], "natur": [1, 8, 21], "languag": [1, 8, 12, 21], "process": [1, 4, 7, 8, 9, 10, 15, 16, 17, 21], "2619": [1, 21], "2629": [1, 21], "least": [1, 4], "entity2class": 1, "wikidata_gazett": 1, "sqlite3": [1, 4, 7], "connect": [1, 4, 7, 15, 25], "rel_db": [1, 4, 7], "embeddings_databas": [1, 4, 7], "db": [1, 4, 7, 18, 24], "conn": [1, 4, 7], "cursor": [1, 4, 7, 25], "rel_param": [1, 7, 8, 25], "data_path": [1, 7], "training_split": [1, 7], "originalsplit": [1, 4, 7, 24], "db_embed": [1, 7], "with_publ": [1, 7], "without_microtoponym": [1, 7, 8], "default_publnam": [1, 7], "default_publwqid": [1, 7], "specif": [1, 15, 16, 19, 24], "linking_df_split": [1, 4], "tsv": [1, 4, 16, 17, 24], "column": [1, 4, 9, 16], "databas": [1, 4, 7, 24, 25], "featur": 1, "filter": [1, 9, 12], "out": [1, 4], "microtoponym": [1, 8], "overrid": 1, "ignor": [1, 19], "As": 1, "infer": 1, "characterist": 1, "linking_resourc": [1, 7], "ed_model": 1, "train_load_model": [1, 7], "self": [1, 24], "whose": [1, 21, 25], "instal": [2, 5, 26, 27], "system": [2, 5, 9], "pyenv": [2, 5], "poetri": [2, 5, 27], "project": [2, 5], "hoook": [2, 5], "resourc": [2, 3, 5, 7, 8, 9, 16, 24, 27], "disambigu": [2, 5, 7, 8, 19, 21, 25], "train": [2, 5, 7, 9, 10, 12, 15, 16, 19, 21, 25], "entiti": [2, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25], "deezymatch": [2, 
5, 9, 12], "summari": [2, 5], "The": [2, 4, 5, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 20, 21, 23, 24, 25, 26, 27, 28], "complet": [2, 5, 16], "tour": [2, 5], "pipelin": [2, 4, 6, 10, 11, 17, 26, 27], "recognis": [2, 5, 6, 8, 11, 17, 27], "ranker": [2, 5, 6, 7, 8, 11, 12, 25, 27], "linker": [2, 4, 5, 6, 8, 11, 19, 20, 21, 23, 27], "work": [3, 4, 5], "codebas": 3, "suggest": 3, "linux": 3, "ubuntu": 3, "20": [3, 7, 18, 19, 20, 21, 23, 28], "04": 3, "date": 3, "essenti": 3, "sudo": [3, 27], "apt": 3, "build": [3, 4, 5, 17, 26], "curl": [3, 28], "libbz2": 3, "dev": [3, 4], "libffi": 3, "liblzma": 3, "libncursesw5": 3, "libreadlin": 3, "libsqlite3": 3, "libssl": 3, "libxml2": 3, "libxmlsec1": 3, "llvm": 3, "tk": 3, "wget": 3, "xz": 3, "util": [3, 4, 5, 11, 27], "zlib1g": 3, "Then": 3, "manag": [3, 7], "virtual": 3, "environ": [3, 27], "http": [3, 12, 16, 17, 18, 27, 28], "bash": [3, 27], "And": 3, "properli": 3, "export": [3, 27], "echo": 3, "pyenv_root": 3, "home": [3, 27], "bashrc": 3, "bin": 3, "v": [3, 27], "1": [3, 5, 8, 9, 15, 17, 19, 21, 26], "null": 3, "2": [3, 5, 17, 26], "n": [3, 10, 12], "eval": 3, "init": 3, "nfi": 3, "restart": [3, 27], "session": [3, 4], "sourc": [3, 16], "environemnt": 3, "3": [3, 4, 5, 7, 16, 17, 19, 21, 26], "9": [3, 4], "7": [3, 4, 21], "dipend": 3, "across": [3, 9], "ssl": 3, "org": [3, 16, 18], "python3": 3, "now": [3, 4, 8, 17, 28], "repo": 3, "cd": 3, "git": 3, "github": [3, 4, 7, 12, 17, 18, 19, 20, 21, 23], "com": [3, 12, 17, 18], "machin": 3, "explicitli": 3, "tell": 3, "defin": [3, 12], "abov": [3, 4, 7, 17], "env": 3, "kernel": 3, "ipython": 3, "kernel_nam": 3, "activ": 3, "shell": 3, "usual": 3, "instanc": [3, 10, 15], "add": [3, 4, 21, 23, 25, 27], "packag": 3, "unit": [3, 4, 7], "pytest": 3, "integr": 3, "some": [3, 4, 15], "jupyt": 3, "guarante": 3, "style": 3, "consist": [3, 4, 12, 15, 16, 17, 18, 26], "basic": 3, "hook": 3, "sever": 4, "load": [4, 7, 9, 10, 27], "web": 4, "gener": [4, 9, 12, 16, 25, 28], "give": 4, 
"option": [4, 7, 8, 9, 10, 12, 16, 17, 18, 23, 25, 27], "own": [4, 17, 26], "an": [4, 5, 7, 8, 9, 10, 12, 15, 16, 17, 19, 20, 21, 23, 25], "focus": 4, "english": [4, 12], "topres19th": 4, "british": 4, "repositori": [4, 7, 13, 19, 20, 21, 23, 26, 27], "output": [4, 7, 8, 12, 17, 24], "ner_fine_test": 4, "continu": 4, "read": [4, 16], "descript": [4, 15], "don": 4, "ner": [4, 5, 8, 10, 11, 14, 16, 17], "model": [4, 7, 8, 9, 10, 12, 15, 16, 17, 19, 21, 25], "otherwis": [4, 9, 10, 17, 18], "correspond": [4, 5, 7, 8, 9, 12, 15, 16, 17, 18, 19, 24, 25], "pair": [4, 9, 12, 17], "string": [4, 8, 9, 12, 15, 16, 17, 24, 25], "token": [4, 10, 12, 15, 16, 17, 23, 24], "ner_tag": [4, 15, 16], "annot": [4, 16, 17, 24], "bio": [4, 16], "3896239_29": 4, "o": [4, 10, 15, 16, 17], "b": [4, 15, 16, 17], "street": 4, "old": 4, "millgat": 4, "collegi": 4, "church": 4, "arriv": 4, "littl": 4, "after": [4, 8, 16, 18], "ten": 4, "oclock": 4, "8262498_11": 4, "On": 4, "jsth": 4, "novemb": 4, "ship": 4, "santo": 4, "christo": 4, "monteveido": 4, "cadiz": 4, "hide": 4, "copper": 4, "10715509_7": 4, "coach": 4, "southampton": 4, "everi": 4, "morn": 4, "quarter": 4, "6": 4, "sundai": 4, "automat": [4, 28], "won": 4, "mostpopular": [4, 7, 8], "ed": [4, 19], "singl": [4, 8, 15, 28], "article_id": [4, 16, 17], "articl": [4, 16, 17, 24, 25], "origin": [4, 7, 8, 9, 15, 16, 17, 21, 23, 25], "1218_poole1860": 4, "1218": 4, "sentence_po": [4, 17], "sentence_text": 4, "dukinfield": 4, "knutsford": 4, "servant": 4, "girl": 4, "eliza": 4, "ann": 4, "byrom": 4, "who": 4, "stole": 4, "quantiti": 4, "cloth": 4, "hous": 4, "she": 4, "lodg": 4, "dukiafield": 4, "wa": [4, 7, 9, 10], "month": 4, "imprison": 4, "mention": [4, 5, 7, 8, 9, 12, 15, 17, 19, 21, 25], "mention_po": 4, "actual": [4, 16], "entity_typ": [4, 15], "wkdt_qid": [4, 16], "mention_start": 4, "charact": [4, 10, 15, 16, 17, 18, 24], "mention_end": 4, "sent_po": 4, "q1976179": 4, "q1470791": 4, "104": 4, "114": 4, "newspap": [4, 10], "belong": 
[4, 16], "manchest": [4, 28], "uk": 4, "leav": 4, "left_out": 4, "maintain": [4, 15], "66": 4, "33": 4, "divid": 4, "withouttest": 4, "seri": [4, 7, 9, 17, 19, 20, 21, 23], "These": 4, "wiki2gaz": 4, "soon": 4, "built": [4, 27], "content": [4, 28], "prefer": 4, "mean": [4, 16], "inner": [4, 17, 24], "absolut": 4, "count": [4, 25], "particular": 4, "access": [4, 8, 26], "open": [4, 25], "r": 4, "f": [4, 21, 27], "assign": [4, 7, 9, 15, 16, 17], "4457": 4, "map": [4, 8, 9, 12, 15, 16, 17, 18, 24, 25], "through": [4, 8, 9, 17], "normalis": [4, 8, 23], "9767696690773614": 4, "03125": 4, "frequenc": [4, 9], "005478851632697786": 4, "wilton": 4, "00021915406530791147": 4, "colleg": 4, "council": 4, "0015340784571553803": 4, "west": 4, "north": 4, "wilt": 4, "counti": 4, "0026298487836949377": 4, "010081087004163929": 4, "kingdom": [4, 7], "plain": [4, 24], "97": 4, "show": [4, 5, 8, 9, 28], "five": [4, 17], "panda": [4, 9, 16, 17, 25], "read_csv": 4, "head": 4, "english_label": 4, "q5059107": 4, "centenni": 4, "40": [4, 17], "01140": 4, "87": 4, "24330": 4, "q5059144": 4, "ground": [4, 19], "39": 4, "99270": 4, "75": [4, 9], "19380": 4, "q5059153": 4, "high": [4, 19], "school": 4, "06170": 4, "83": 4, "05780": 4, "q5059162": 4, "38": [4, 17], "30440": 4, "63800": 4, "4": [4, 5, 10, 17, 19, 26], "q5059178": 4, "memori": 4, "samsung": 4, "hall": 4, "37": 4, "58949": 4, "127": 4, "03434": 4, "onli": [4, 15, 17], "post": [4, 8], "render": 4, "dummi": 4, "float": [4, 7, 8, 9, 12, 17, 19], "interest": 4, "affect": 4, "likewis": 4, "q180673": 4, "cerimoni": 4, "wherea": 4, "complain": 4, "resolut": [4, 16], "wiki2vec": 4, "entity_embed": 4, "lower": [4, 18, 23], "preced": 4, "wildcard": 4, "unk": [4, 23], "emb": 4, "deriv": 4, "share": 4, "toward": 4, "meanwhil": 4, "pleas": 4, "index_enwiki": [4, 24], "latest": [4, 24, 27], "arrai": [4, 25], "execut": [4, 7, 9, 10], "lerwick": 4, "fetchon": 4, "els": 4, "tolist": 4, "3257000148296356": 4, "00989999994635582": 4, 
"13420000672340393": 4, "014700000174343586": 4, "007899999618530273": 4, "1808999925851822": 4, "candid": [4, 5, 7, 8, 9, 12, 17, 19, 21, 25], "perfectmatch": [4, 8, 9], "digitis": 4, "word1": 4, "word2": 4, "boolean": 4, "might": 4, "wish": 4, "hardli": 4, "didnot": 4, "never": 4, "reus": 4, "coeld": 4, "conld": 4, "couid": 4, "histor": [4, 10], "nois": [4, 9], "expand": 4, "extract": [4, 8, 9, 17, 25], "085": 4, "514": 4, "billion": 4, "corpu": 4, "19th": 4, "zenodo": 4, "period": 4, "year": [4, 7, 16, 17, 19, 20, 21, 23], "w2v_xxxxs_new": 4, "xxxx": 4, "decad": [4, 16], "1800": 4, "1810": 4, "w2v_1800_new": 4, "w2v_1810_new": 4, "tutori": [4, 10], "app": [4, 26, 27, 28], "evalu": [4, 5, 10, 17], "wikidta_gazett": 4, "mark": 4, "asterisk": 4, "instanti": [4, 7, 8, 27], "plu": 4, "sign": 4, "given": [5, 7, 8, 9, 10, 12, 16, 17, 18, 19, 21, 23, 25], "design": 5, "tackl": 5, "problem": [5, 15], "deploi": [5, 28], "api": [5, 24], "element": [5, 15, 16, 17, 24, 25], "modul": [5, 11], "t_re": [5, 6, 11, 14, 20], "deezy_process": [5, 11, 14], "get_data": [5, 11, 14], "preprocess_data": [5, 11, 14], "process_data": [5, 11, 14], "process_wikipedia": [5, 11, 14], "rel_e2": [5, 11, 14], "rel_util": [5, 11, 14], "multipl": [5, 15, 26], "via": [5, 26, 28], "docker": [5, 26], "compos": [5, 26], "configur": [5, 26], "deploy": [5, 26], "extern": 5, "search": 5, "random_se": [6, 7, 19, 20], "liter": [7, 9, 15, 25], "reldisamb": [7, 25], "bydist": 7, "experiments_path": [7, 8], "bool": [7, 8, 9, 10, 17, 18, 21, 23], "flag": [7, 9, 23], "radboud": [7, 19, 20, 21, 23], "entityt": 7, "establish": 7, "wpubl": [7, 27, 28], "wmtop": [7, 27, 28], "q145": 7, "by_dist": 7, "dict_ment": 7, "origin_wqid": 7, "tupl": [7, 8, 9, 12, 15, 16, 17, 24], "relev": [7, 8, 9], "calcul": [7, 9], "closest": 7, "round": 7, "decim": 7, "undertak": 7, "most_popular": 7, "determin": 7, "entitydisambigu": [7, 19, 20], "entity_disambigu": [7, 14, 20], "exist": [7, 9, 10, 12, 16, 18, 19], "initi": [7, 10, 17, 
25], "had": 7, "credit": [7, 10, 15, 18, 19, 21, 23], "copyright": [7, 19, 20, 21, 23], "c": [7, 19, 20, 21, 23], "michael": [7, 19, 20, 21, 23], "permiss": [7, 19, 20, 21, 23], "inproceed": [7, 19, 20, 21, 23], "vanhulst": [7, 19, 20, 21, 23], "author": [7, 19, 20, 21, 23], "titl": [7, 16, 18, 19, 20, 21, 23, 24], "booktitl": [7, 19, 20, 21, 23], "retriev": [7, 12, 17, 19, 20, 21, 23, 24, 25], "publish": [7, 19, 20, 21, 23], "42": [7, 19], "repres": [8, 9, 10, 15, 17, 23, 25], "includ": [8, 9, 10, 16, 20], "setup": 8, "visit": 8, "pari": 8, "york": [8, 16, 18], "last": [8, 18], "summer": 8, "processed_data": [8, 17], "format_predict": 8, "wk_cand": [8, 25], "int": [8, 12, 15, 17, 23, 24], "document_dataset": 8, "nest": [8, 15], "outermost": 8, "potenti": 8, "second": [8, 9, 12, 17], "salop": 8, "q201970": 8, "0006031363088057901": 8, "q23103": 8, "0075279261777561925": 8, "postprocess_output": 8, "larger": 8, "postprocess": [8, 17], "exclud": [8, 25], "dure": [8, 15, 17, 23], "along": [8, 16, 25], "run_sentence_recognit": 8, "entir": 8, "keyword": [8, 12], "futur": 8, "point": [8, 17, 25], "standard": [8, 16, 17, 24], "remain": [8, 9], "redund": 8, "partialmatch": 9, "levenshtein": 9, "wikidata_to_ment": [9, 12], "already_collected_cand": 9, "perfect": 9, "handl": [9, 10], "collect": [9, 15], "applic": [9, 28], "paraguai": 9, "already_collect": 9, "mention_candid": 9, "mention_already_collect": 9, "check_if_contain": 9, "amount": 9, "within": 9, "degre": 9, "rang": 9, "appl": 9, "delici": 9, "match_scor": 9, "3333333333333333": 9, "damlev_dist": 9, "damerau": 9, "etiti": 9, "lowercas": [9, 18, 23, 25], "normal": [9, 19, 23], "subtract": 9, "orang": 9, "1666666865348816": 9, "deezy_on_the_fli": 9, "network": [9, 21], "fly": 9, "attribut": [9, 10], "shefrield": 9, "sheffield": 9, "03382000000000005": 9, "perfectli": 9, "sub": 9, "guadaloup": 9, "sn83030483": [9, 17], "1790": [9, 17], "03": [9, 17], "31": 9, "i0004_1": 9, "q17012": 9, "003935458480913026": 9, 
"q3153836": 9, "07407407407407407": 9, "appropri": [9, 15], "chosen": 9, "addition": 9, "q2477346": 9, "remov": [9, 18], "pandarallel": 9, "partial_match": 9, "damlev": 9, "banana": 9, "perfect_match": 9, "altnam": 9, "check": 9, "vari": 9, "further": 9, "barcelona": 9, "bologna": 9, "deleg": 9, "pipe": 10, "5e": 10, "05": 10, "uncas": 10, "create_pipelin": 10, "ner_predict": [10, 17], "99975187": [10, 17], "dash": 10, "replac": [10, 15, 18, 23], "comma": 10, "pars": [10, 25], "issu": [10, 15], "align": [10, 15, 17], "trainer": 10, "obtain_match": [12, 14], "english_word": 12, "sim": 12, "fuzz_ratio_threshold": 12, "union": [12, 15], "70": 12, "classifi": 12, "100": 12, "nearest": 12, "neighbor": 12, "discard": 12, "thefuzz": 12, "fuzz": 12, "seatgeek": 12, "simpl": 12, "create_training_set": [12, 14], "neighbour": 12, "randomli": 12, "insid": [12, 16], "train_deezy_model": [12, 14], "generate_candid": [12, 14], "write": 12, "download_lwm_data": [13, 14], "news_path": 13, "bl": 13, "unzip": 13, "download_hipe_data": [13, 14], "hipe_path": [13, 16], "training_tokenize_and_align_label": [14, 15], "collect_named_ent": [14, 15], "aggregate_ment": [14, 15], "fix_capit": [14, 15], "fix_hyphen": [14, 15], "fix_nest": [14, 15], "fix_startent": [14, 15], "aggregate_ent": [14, 15], "turn_wikipedia2wikidata": [14, 16], "reconstruct_sent": [14, 16], "process_lwm_for_n": [14, 16], "process_lwm_for_link": [14, 16], "aggregate_hipe_ent": [14, 16], "process_hipe_for_link": [14, 16], "process_tsv": [14, 16], "fine_to_coars": [14, 16], "eval_with_except": [14, 17, 25], "prepare_s": [14, 17], "align_gold": [14, 17], "postprocess_predict": [14, 17], "ner_and_process": [14, 17], "update_with_link": [14, 17], "update_with_skylin": [14, 17], "prepare_storing_link": [14, 17], "store_for_scor": [14, 17], "make_wikilinks_consist": [14, 18], "make_wikipedia2wikidata_consis": [14, 18], "title_to_id": [14, 18], "rel_end_to_end": [14, 24], "get_rel_from_api": [14, 24], 
"match_wikipedia_to_wikidata": [14, 24], "match_ent": [14, 24], "postprocess_rel": [14, 24], "store_rel": [14, 24], "run_rel_experi": [14, 24], "get_db_emb": [14, 25], "prepare_initial_data": [14, 25], "rank_candid": [14, 25], "add_publ": [14, 25], "prepare_rel_trainset": [14, 25], "mulrel_rank": [14, 20], "vocabulari": [14, 20], "pretrainedtoken": 15, "pretrainedtokenizerfast": 15, "label_encoding_dict": 15, "encod": [15, 18], "label2id": 15, "tokenization_utils_bas": 15, "batchencod": 15, "namedtupl": 15, "iter": 15, "over": [15, 19], "keep": [15, 17], "start_char": [15, 17], "end_char": [15, 17], "reserv": 15, "offset": [15, 24], "e_typ": 15, "start_offset": [15, 17], "end_offset": [15, 17], "pred": [15, 17], "aggreg": [15, 16], "consolid": 15, "reconstruct": [15, 16], "white": [15, 16], "space": [15, 16, 18], "haven": [15, 17], "yet": [15, 17], "manual": [15, 17], "ner_label": [15, 17], "entity_link": [15, 17], "correct": [15, 28], "capit": 15, "error": [15, 17, 25], "occur": [15, 17], "incorrect": 15, "surfac": 15, "form": [15, 25], "lentiti": [15, 16], "fix": 15, "prefix": [15, 16, 25, 27], "hyphen": 15, "incorrectli": 15, "address": 15, "group": 15, "sequenc": 15, "regard": 15, "phrase": 15, "solut": 15, "current": [15, 16, 17], "part": [15, 16, 17, 18], "island": 15, "terceira": 15, "preposit": 15, "begin": 15, "instead": [15, 17, 26], "join": [15, 16], "wikipedia_titl": 16, "wikipedia_path": 16, "convert": [16, 18, 23, 25], "avail": [16, 21, 27, 28], "wiki": [16, 18], "colosseum": 16, "q10285": 16, "ancient_egypt": 16, "q11768": 16, "invalid_loc": 16, "warn": 16, "wikipedia2wikidata": [16, 18], "dtoken": 16, "ensur": 16, "tsv_topres_path": 16, "scheme": 16, "10813493_1": 16, "document_id": 16, "_": [16, 17], "sentence_id": [16, 17], "india": 16, "annotated_tsv": 16, "occurr": 16, "resources_dir": 16, "gazetteer_id": [16, 24], "ocr_quality_mean": [16, 17], "qualiti": 16, "ocr_quality_sd": [16, 17], "deviat": 16, "publication_titl": [16, 17], 
"publication_cod": [16, 17], "metadata": [16, 17], "consecut": 16, "ne_typ": 16, "q60": 16, "12": 16, "meto_typ": 16, "updated_ent": 16, "multi": [16, 21], "help": 16, "contigu": 16, "filepath": 16, "webanno": 16, "dmtoken": 16, "six": 16, "url": [16, 18], "grain": 16, "coars": 16, "equival": 16, "str2pars": [17, 25], "in_cas": [17, 25], "ast": 17, "literal_ev": 17, "succe": 17, "valueerror": 17, "success": [17, 25], "dsentenc": [17, 24], "10732214_1": 17, "10732214": 17, "unprocess": 17, "dannot": 17, "anoth": 17, "dmetadata": 17, "produc": [17, 21], "relabel": 17, "petr": 17, "q335322": 17, "gold_posit": 17, "later": 17, "tokenis": 17, "enabl": [17, 27], "assess": 17, "sentence_pr": 17, "represent": 17, "sentence_tru": 17, "sentence_ski": 17, "dpred": 17, "999826967716217": 17, "_ner_predict": 17, "dtrue": 17, "_gold_standard": 17, "dsky": 17, "skylin": 17, "At": 17, "without": 17, "fill": 17, "_ner_skylin": 17, "gold_token": [17, 24], "unitec": 17, "193": 17, "199": 17, "q30": 17, "_gold_posit": 17, "dmentionspr": 17, "i0001_9": 17, "state": 17, "206": 17, "79": 17, "_pred_ment": 17, "dmentionsgold": 17, "analog": 17, "link_predict": 17, "incorpor": 17, "test_df": 17, "all_test": 17, "end_to_end_ev": 17, "sky": 17, "achiev": 17, "choos": 17, "among": 17, "u": 17, "hipe_scorer_results_path": 17, "scenario_nam": 17, "dresult": 17, "articles_test": 17, "clef": 17, "conll": [17, 24], "impresso": 17, "oper": 18, "unquot": 18, "decod": 18, "percent": 18, "underscor": [18, 24], "fragment": 18, "symbol": 18, "quot": 18, "modifi": 18, "special": 18, "python_": 18, "programming_languag": 18, "overview": 18, "28program": 18, "20languag": 18, "29": 18, "data_sci": 18, "20scienc": 18, "san_francisco": 18, "san": 18, "20francisco": 18, "mapper": 18, "make_wikipedia2wikidata_consist": 18, "new_york_c": 18, "scienc": 18, "page_titl": 18, "path_to_db": [18, 24], "unescap": 18, "fermat": 18, "27s_last_theorem": 18, "s_last_theorem": 18, "wikidata2wikipedia": 18, "entri": 18, 
"manate": 18, "wiki_page_titl": 18, "jcklie": 18, "wikimapp": 18, "db_emb": 19, "user_config": 19, "reset_embed": 19, "architectur": 19, "mulrelrank": [19, 20, 21], "get_data_item": 19, "dname": 19, "respons": [19, 21], "trigger": 19, "prerank": [19, 20, 21], "normalize_scor": 19, "rescal": 19, "sum": 19, "truth": 19, "possibl": [19, 25], "max": 19, "org_train_dataset": 19, "org_dev_dataset": 19, "train_lr": 19, "train_json": 19, "dev_json": 19, "model_path_lr": 19, "lr": 19, "recal": 19, "low": 19, "corrrect": 19, "config": [21, 26, 27], "multipli": 21, "minim": 21, "forward": 21, "token_id": 21, "token_offset": 21, "entity_id": 21, "devic": 21, "mulrel": 21, "nel": 21, "ganea2017deep": 21, "le2018improv": 21, "tok_mask": 21, "entity_mask": 21, "p_e_m": 21, "ctx_layer": 21, "figur": 21, "ent_scor": 21, "q": 21, "score_combin": 21, "loss": 21, "true_po": 21, "lamb": 21, "1e": 21, "07": 21, "equat": 21, "max_norm": 21, "add_to_vocab": 23, "get_id": 23, "unknown": 23, "static": 23, "digit_0": 23, "rule": [23, 27], "digit": 23, "unk_token": 23, "sent": 24, "rel_end2end_path": 24, "wiki_titl": 24, "separ": 24, "pred_ent": 24, "prev_ann": 24, "rel_pr": 24, "wikigaz_id": 24, "retoken": 24, "drel": 24, "how_split": 24, "ashton1860": 24, "embtyp": 25, "snd": 25, "ndarrai": 25, "wikipedia2vec": 25, "np": 25, "preappend": 25, "rel_json": 25, "publnam": 25, "publwqid": 25, "dsplit": 25, "fastapi": 26, "remot": 26, "app_templ": [26, 27], "config_nam": 26, "templat": [26, 27], "dockerfil": [26, 27], "yml": [26, 27], "fit": 26, "standalon": 27, "simultan": 27, "behind": 27, "revers": 27, "proxi": 27, "traefik": 27, "server": [27, 28], "offici": 27, "guid": 27, "imag": 27, "res_deezy_reldisamb": [27, 28], "root": 27, "container_nam": 27, "cach": 27, "arg": 27, "app_nam": 27, "_imag": 27, "ref": 27, "preprocess": 27, "balanc": 27, "host_url": 27, "your_host_url": 27, "d": [27, 28], "edit": 27, "endpoint": [27, 28], "behaviour": 27, "variabl": 27, "servic": 27, "your_config_nam": 
27, "expos": [27, 28], "loadbalanc": 27, "port": [27, 28], "router": 27, "_router": 27, "host": 27, "pathprefix": 27, "v2": [27, 28], "res_": 27, "middlewar": 27, "stripprefix": 27, "rwop": 27, "uvicorn": 27, "header": 27, "8000": 28, "interact": 28, "swagger": 28, "doc": 28, "184": 28, "45": 28, "h": 28, "harvei": 28, "elizabeth": 28, "barnett": 28, "q18125": 28, "api_usag": 28, "variou": 28}, "objects": {"t_res.geoparser.linking": [[7, 0, 1, "", "Linker"], [7, 2, 1, "", "RANDOM_SEED"]], "t_res.geoparser.linking.Linker": [[7, 1, 1, "", "by_distance"], [7, 1, 1, "", "load_resources"], [7, 1, 1, "", "most_popular"], [7, 1, 1, "", "run"], [7, 1, 1, "", "train_load_model"]], "t_res.geoparser.pipeline": [[8, 0, 1, "", "Pipeline"]], "t_res.geoparser.pipeline.Pipeline": [[8, 1, 1, "", "format_prediction"], [8, 1, 1, "", "run_candidate_selection"], [8, 1, 1, "", "run_disambiguation"], [8, 1, 1, "", "run_sentence"], [8, 1, 1, "", "run_sentence_recognition"], [8, 1, 1, "", "run_text"], [8, 1, 1, "", "run_text_recognition"]], "t_res.geoparser.ranking": [[9, 0, 1, "", "Ranker"]], "t_res.geoparser.ranking.Ranker": [[9, 1, 1, "", "check_if_contained"], [9, 1, 1, "", "damlev_dist"], [9, 1, 1, "", "deezy_on_the_fly"], [9, 1, 1, "", "find_candidates"], [9, 1, 1, "", "load_resources"], [9, 1, 1, "", "partial_match"], [9, 1, 1, "", "perfect_match"], [9, 1, 1, "", "run"], [9, 1, 1, "", "train"]], "t_res.geoparser.recogniser": [[10, 0, 1, "", "Recogniser"]], "t_res.geoparser.recogniser.Recogniser": [[10, 1, 1, "", "create_pipeline"], [10, 1, 1, "", "ner_predict"], [10, 1, 1, "", "train"]], "t_res.utils.REL.entity_disambiguation": [[19, 0, 1, "", "EntityDisambiguation"], [19, 2, 1, "", "RANDOM_SEED"]], "t_res.utils.REL.entity_disambiguation.EntityDisambiguation": [[19, 1, 1, "", "get_data_items"], [19, 1, 1, "", "normalize_scores"], [19, 1, 1, "", "predict"], [19, 1, 1, "", "prerank"], [19, 1, 1, "", "train"], [19, 1, 1, "", "train_LR"]], "t_res.utils.REL.mulrel_ranker": [[21, 0, 1, 
"", "MulRelRanker"], [21, 0, 1, "", "PreRank"]], "t_res.utils.REL.mulrel_ranker.MulRelRanker": [[21, 1, 1, "", "forward"], [21, 1, 1, "", "loss"], [21, 1, 1, "", "regularize"], [21, 2, 1, "", "training"]], "t_res.utils.REL.mulrel_ranker.PreRank": [[21, 1, 1, "", "forward"], [21, 2, 1, "", "training"]], "t_res.utils.REL.vocabulary": [[23, 0, 1, "", "Vocabulary"]], "t_res.utils.REL.vocabulary.Vocabulary": [[23, 1, 1, "", "add_to_vocab"], [23, 1, 1, "", "get_id"], [23, 1, 1, "", "normalize"], [23, 1, 1, "", "size"], [23, 2, 1, "", "unk_token"]], "t_res.utils.deezy_processing": [[12, 3, 1, "", "create_training_set"], [12, 3, 1, "", "generate_candidates"], [12, 3, 1, "", "obtain_matches"], [12, 3, 1, "", "train_deezy_model"]], "t_res.utils.get_data": [[13, 3, 1, "", "download_hipe_data"], [13, 3, 1, "", "download_lwm_data"]], "t_res.utils.ner": [[15, 3, 1, "", "aggregate_entities"], [15, 3, 1, "", "aggregate_mentions"], [15, 3, 1, "", "collect_named_entities"], [15, 3, 1, "", "fix_capitalization"], [15, 3, 1, "", "fix_hyphens"], [15, 3, 1, "", "fix_nested"], [15, 3, 1, "", "fix_startEntity"], [15, 3, 1, "", "training_tokenize_and_align_labels"]], "t_res.utils": [[16, 4, 0, "-", "preprocess_data"]], "t_res.utils.preprocess_data": [[16, 3, 1, "", "aggregate_hipe_entities"], [16, 3, 1, "", "fine_to_coarse"], [16, 3, 1, "", "process_hipe_for_linking"], [16, 3, 1, "", "process_lwm_for_linking"], [16, 3, 1, "", "process_lwm_for_ner"], [16, 3, 1, "", "process_tsv"], [16, 3, 1, "", "reconstruct_sentences"], [16, 3, 1, "", "turn_wikipedia2wikidata"]], "t_res.utils.process_data": [[17, 3, 1, "", "align_gold"], [17, 3, 1, "", "eval_with_exception"], [17, 3, 1, "", "ner_and_process"], [17, 3, 1, "", "postprocess_predictions"], [17, 3, 1, "", "prepare_sents"], [17, 3, 1, "", "prepare_storing_links"], [17, 3, 1, "", "store_for_scorer"], [17, 3, 1, "", "update_with_linking"], [17, 3, 1, "", "update_with_skyline"]], "t_res.utils.process_wikipedia": [[18, 3, 1, "", 
"make_wikilinks_consistent"], [18, 3, 1, "", "make_wikipedia2wikidata_consisent"], [18, 3, 1, "", "title_to_id"]], "t_res.utils.rel_e2e": [[24, 3, 1, "", "get_rel_from_api"], [24, 3, 1, "", "match_ent"], [24, 3, 1, "", "match_wikipedia_to_wikidata"], [24, 3, 1, "", "postprocess_rel"], [24, 3, 1, "", "rel_end_to_end"], [24, 3, 1, "", "run_rel_experiments"], [24, 3, 1, "", "store_rel"]], "t_res.utils.rel_utils": [[25, 3, 1, "", "add_publication"], [25, 3, 1, "", "eval_with_exception"], [25, 3, 1, "", "get_db_emb"], [25, 3, 1, "", "prepare_initial_data"], [25, 3, 1, "", "prepare_rel_trainset"], [25, 3, 1, "", "rank_candidates"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:attribute", "3": "py:function", "4": "py:module"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"], "4": ["py", "module", "Python module"]}, "titleterms": {"experi": 0, "evalu": 0, "1": [0, 1, 4, 27], "obtain": 0, "extern": 0, "resourc": [0, 1, 4], "2": [0, 1, 4, 27], "prepar": 0, "data": [0, 4], "3": [0, 1, 27], "run": 0, "4": [0, 1, 27], "The": 1, "complet": 1, "tour": 1, "pipelin": [1, 5, 8], "instanti": 1, "us": [1, 3, 28], "end": 1, "step": 1, "descript": 1, "output": 1, "recommend": 1, "recognis": [1, 10], "train": [1, 4], "ner": [1, 15], "model": 1, "ranker": [1, 9], "perfectmatch": 1, "partialmatch": 1, "levenshtein": 1, "deezymatch": [1, 4], "option": 1, "from": 1, "scratch": 1, "given": 1, "an": 1, "exist": 1, "string": 1, "pair": 1, "dataset": [1, 4], "includ": 1, "gener": 1, "load": 1, "retriev": 1, "candid": 1, "mention": 1, "linker": [1, 7], "mostpopular": 1, "reldisamb": 1, "entiti": [1, 4], "disambigu": [1, 4], "get": 2, "start": 2, "tabl": [2, 5, 6, 11, 14, 20, 26], "content": [2, 5, 6, 11, 14, 20, 26], "instal": 3, "t": [3, 5, 26, 27, 28], "re": [3, 5, 26, 27, 28], "updat": 3, "system": 3, "pyenv": 3, "poetri": 3, "project": 3, "how": 3, 
"pre": 3, "commit": 3, "hoook": 3, "directori": 4, "structur": 4, "toponym": [4, 5], "recognit": 4, "wikipedia": 4, "wikidata": 4, "base": 4, "mentions_to_wikidata": 4, "json": 4, "mentions_to_wikidata_norm": 4, "wikidata_to_mentions_norm": 4, "wikidata_gazett": 4, "csv": 4, "entity2class": 4, "txt": 4, "word": 4, "embed": 4, "set": 4, "word2vec": 4, "noisi": 4, "summari": 4, "A": 5, "resolut": 5, "digitis": 5, "histor": 5, "newspap": 5, "indic": 5, "geopars": [6, 7, 8, 9, 10], "modul": [6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], "t_re": [7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25], "link": 7, "rank": 9, "refer": 11, "util": [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], "deezy_process": 12, "get_data": 13, "preprocess_data": 16, "process_data": 17, "process_wikipedia": 18, "rel": [19, 20, 21, 22, 23], "entity_disambigu": 19, "mulrel_rank": 21, "vocabulari": 23, "rel_e2": 24, "rel_util": 25, "deploi": [26, 27], "api": [26, 27, 28], "build": 27, "contain": 27, "multipl": 27, "via": 27, "docker": 27, "compos": 27, "configur": 27, "your": 27, "deploy": 27}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx": 57}, "alltitles": {"Experiments and evaluation": [[0, "experiments-and-evaluation"]], "1. Obtain the external resources": [[0, "obtain-the-external-resources"]], "2. Preparing the data": [[0, "preparing-the-data"]], "3. Running the experiments": [[0, "running-the-experiments"]], "4. Evaluate": [[0, "evaluate"]], "The complete tour": [[1, "the-complete-tour"]], "The Pipeline": [[1, "the-pipeline"]], "1. Instantiate the Pipeline": [[1, "instantiate-the-pipeline"]], "2. 
Use the Pipeline": [[1, "use-the-pipeline"]], "End-to-end pipeline": [[1, "end-to-end-pipeline"]], "Step-by-step pipeline": [[1, "step-by-step-pipeline"]], "Description of the output": [[1, "description-of-the-output"]], "Pipeline recommendations": [[1, "pipeline-recommendations"]], "The Recogniser": [[1, "the-recogniser"]], "1. Instantiate the Recogniser": [[1, "instantiate-the-recogniser"]], "2. Train the NER model": [[1, "train-the-ner-model"]], "The Ranker": [[1, "the-ranker"]], "1. Instantiate the Ranker": [[1, "instantiate-the-ranker"]], "1.1. Perfectmatch, partialmatch, and levenshtein": [[1, "perfectmatch-partialmatch-and-levenshtein"]], "1.2. DeezyMatch": [[1, "id4"]], "Option 1. Train a DeezyMatch model from scratch, given an existing string pairs dataset": [[1, "option-1-train-a-deezymatch-model-from-scratch-given-an-existing-string-pairs-dataset"]], "Option 2. Train a DeezyMatch model from scratch, including generating a string pairs dataset": [[1, "option-2-train-a-deezymatch-model-from-scratch-including-generating-a-string-pairs-dataset"]], "2. Load the resources": [[1, "load-the-resources"], [1, "id9"]], "3. Train a DeezyMatch model": [[1, "train-a-deezymatch-model"]], "4. Retrieve candidates for a given mention": [[1, "retrieve-candidates-for-a-given-mention"]], "The Linker": [[1, "the-linker"]], "1. Instantiate the Linker": [[1, "instantiate-the-linker"]], "1.1. mostpopular": [[1, "mostpopular"]], "1.2. reldisamb": [[1, "reldisamb"]], "3. 
Train an entity disambiguation model": [[1, "train-an-entity-disambiguation-model"]], "Getting started": [[2, "getting-started"]], "Table of contents:": [[2, null], [5, null], [6, null], [11, null], [14, null], [20, null], [26, null]], "Installing T-Res": [[3, "installing-t-res"]], "Update the system": [[3, "update-the-system"]], "Install pyenv": [[3, "install-pyenv"]], "Install poetry": [[3, "install-poetry"]], "Project Installation": [[3, "project-installation"]], "How to use poetry": [[3, "how-to-use-poetry"]], "Pre-commit hoooks": [[3, "pre-commit-hoooks"]], "Resources and directory structure": [[4, "resources-and-directory-structure"]], "Toponym recognition and disambiguation training data": [[4, "toponym-recognition-and-disambiguation-training-data"]], "1. Toponym recognition dataset": [[4, "toponym-recognition-dataset"]], "2. Toponym disambiguation dataset": [[4, "toponym-disambiguation-dataset"]], "Wikipedia- and Wikidata-based resources": [[4, "wikipedia-and-wikidata-based-resources"]], "mentions_to_wikidata.json": [[4, "mentions-to-wikidata-json"]], "mentions_to_wikidata_normalized.json": [[4, "mentions-to-wikidata-normalized-json"]], "wikidata_to_mentions_normalized.json": [[4, "wikidata-to-mentions-normalized-json"]], "wikidata_gazetteer.csv": [[4, "wikidata-gazetteer-csv"]], "entity2class.txt": [[4, "entity2class-txt"]], "Entity and word embeddings": [[4, "entity-and-word-embeddings"]], "DeezyMatch training set": [[4, "deezymatch-training-set"]], "1. DeezyMatch training set": [[4, "id6"]], "2. 
Word2Vec embeddings trained on noisy data": [[4, "word2vec-embeddings-trained-on-noisy-data"]], "Summary of resources and directory structure": [[4, "summary-of-resources-and-directory-structure"]], "T-Res: A Toponym Resolution Pipeline for Digitised Historical Newspapers": [[5, "t-res-a-toponym-resolution-pipeline-for-digitised-historical-newspapers"]], "Indices and tables": [[5, "indices-and-tables"]], "geoparser module": [[6, "geoparser-module"]], "t_res.geoparser.linking.Linker": [[7, "t-res-geoparser-linking-linker"]], "t_res.geoparser.pipeline.Pipeline": [[8, "t-res-geoparser-pipeline-pipeline"]], "t_res.geoparser.ranking. Ranker": [[9, "t-res-geoparser-ranking-ranker"]], "t_res.geoparser.recogniser.Recogniser": [[10, "t-res-geoparser-recogniser-recogniser"]], "Reference": [[11, "reference"]], "t_res.utils.deezy_processing module": [[12, "t-res-utils-deezy-processing-module"]], "t_res.utils.get_data module": [[13, "t-res-utils-get-data-module"]], "utils module": [[14, "utils-module"]], "t_res.utils.ner module": [[15, "t-res-utils-ner-module"]], "t_res.utils.preprocess_data module": [[16, "t-res-utils-preprocess-data-module"]], "t_res.utils.process_data module": [[17, "t-res-utils-process-data-module"]], "t_res.utils.process_wikipedia module": [[18, "t-res-utils-process-wikipedia-module"]], "t_res.utils.REL.entity_disambiguation module": [[19, "t-res-utils-rel-entity-disambiguation-module"]], "utils.REL module": [[20, "utils-rel-module"]], "t_res.utils.REL.mulrel_ranker module": [[21, "t-res-utils-rel-mulrel-ranker-module"]], "t_res.utils.REL.t_res.utils module": [[22, "t-res-utils-rel-t-res-utils-module"]], "t_res.utils.REL.vocabulary module": [[23, "t-res-utils-rel-vocabulary-module"]], "t_res.utils.rel_e2e module": [[24, "t-res-utils-rel-e2e-module"]], "t_res.utils.rel_utils module": [[25, "t-res-utils-rel-utils-module"]], "Deploying the T-Res API": [[26, "deploying-the-t-res-api"], [27, "deploying-the-t-res-api"]], "1. 
Building the container": [[27, "building-the-container"]], "2. Deploying the container": [[27, "deploying-the-container"]], "3. Deploying multiple containers via Docker Compose": [[27, "deploying-multiple-containers-via-docker-compose"]], "4. Configuring your deployment": [[27, "configuring-your-deployment"]], "Using the T-Res API": [[28, "using-the-t-res-api"]]}, "indexentries": {"linker (class in t_res.geoparser.linking)": [[7, "t_res.geoparser.linking.Linker"]], "random_seed (t_res.geoparser.linking attribute)": [[7, "t_res.geoparser.linking.RANDOM_SEED"]], "by_distance() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.by_distance"]], "load_resources() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.load_resources"]], "most_popular() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.most_popular"]], "run() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.run"]], "train_load_model() (t_res.geoparser.linking.linker method)": [[7, "t_res.geoparser.linking.Linker.train_load_model"]], "pipeline (class in t_res.geoparser.pipeline)": [[8, "t_res.geoparser.pipeline.Pipeline"]], "format_prediction() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.format_prediction"]], "run_candidate_selection() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_candidate_selection"]], "run_disambiguation() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_disambiguation"]], "run_sentence() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_sentence"]], "run_sentence_recognition() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_sentence_recognition"]], "run_text() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_text"]], 
"run_text_recognition() (t_res.geoparser.pipeline.pipeline method)": [[8, "t_res.geoparser.pipeline.Pipeline.run_text_recognition"]], "ranker (class in t_res.geoparser.ranking)": [[9, "t_res.geoparser.ranking.Ranker"]], "check_if_contained() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.check_if_contained"]], "damlev_dist() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.damlev_dist"]], "deezy_on_the_fly() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.deezy_on_the_fly"]], "find_candidates() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.find_candidates"]], "load_resources() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.load_resources"]], "partial_match() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.partial_match"]], "perfect_match() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.perfect_match"]], "run() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.run"]], "train() (t_res.geoparser.ranking.ranker method)": [[9, "t_res.geoparser.ranking.Ranker.train"]], "recogniser (class in t_res.geoparser.recogniser)": [[10, "t_res.geoparser.recogniser.Recogniser"]], "create_pipeline() (t_res.geoparser.recogniser.recogniser method)": [[10, "t_res.geoparser.recogniser.Recogniser.create_pipeline"]], "ner_predict() (t_res.geoparser.recogniser.recogniser method)": [[10, "t_res.geoparser.recogniser.Recogniser.ner_predict"]], "train() (t_res.geoparser.recogniser.recogniser method)": [[10, "t_res.geoparser.recogniser.Recogniser.train"]], "create_training_set() (in module t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.create_training_set"]], "generate_candidates() (in module t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.generate_candidates"]], "obtain_matches() (in module 
t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.obtain_matches"]], "train_deezy_model() (in module t_res.utils.deezy_processing)": [[12, "t_res.utils.deezy_processing.train_deezy_model"]], "download_hipe_data() (in module t_res.utils.get_data)": [[13, "t_res.utils.get_data.download_hipe_data"]], "download_lwm_data() (in module t_res.utils.get_data)": [[13, "t_res.utils.get_data.download_lwm_data"]], "aggregate_entities() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.aggregate_entities"]], "aggregate_mentions() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.aggregate_mentions"]], "collect_named_entities() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.collect_named_entities"]], "fix_capitalization() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_capitalization"]], "fix_hyphens() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_hyphens"]], "fix_nested() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_nested"]], "fix_startentity() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.fix_startEntity"]], "training_tokenize_and_align_labels() (in module t_res.utils.ner)": [[15, "t_res.utils.ner.training_tokenize_and_align_labels"]], "aggregate_hipe_entities() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.aggregate_hipe_entities"]], "fine_to_coarse() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.fine_to_coarse"]], "module": [[16, "module-t_res.utils.preprocess_data"]], "process_hipe_for_linking() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.process_hipe_for_linking"]], "process_lwm_for_linking() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.process_lwm_for_linking"]], "process_lwm_for_ner() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.process_lwm_for_ner"]], "process_tsv() (in module t_res.utils.preprocess_data)": [[16, 
"t_res.utils.preprocess_data.process_tsv"]], "reconstruct_sentences() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.reconstruct_sentences"]], "t_res.utils.preprocess_data": [[16, "module-t_res.utils.preprocess_data"]], "turn_wikipedia2wikidata() (in module t_res.utils.preprocess_data)": [[16, "t_res.utils.preprocess_data.turn_wikipedia2wikidata"]], "align_gold() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.align_gold"]], "eval_with_exception() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.eval_with_exception"]], "ner_and_process() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.ner_and_process"]], "postprocess_predictions() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.postprocess_predictions"]], "prepare_sents() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.prepare_sents"]], "prepare_storing_links() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.prepare_storing_links"]], "store_for_scorer() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.store_for_scorer"]], "update_with_linking() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.update_with_linking"]], "update_with_skyline() (in module t_res.utils.process_data)": [[17, "t_res.utils.process_data.update_with_skyline"]], "make_wikilinks_consistent() (in module t_res.utils.process_wikipedia)": [[18, "t_res.utils.process_wikipedia.make_wikilinks_consistent"]], "make_wikipedia2wikidata_consisent() (in module t_res.utils.process_wikipedia)": [[18, "t_res.utils.process_wikipedia.make_wikipedia2wikidata_consisent"]], "title_to_id() (in module t_res.utils.process_wikipedia)": [[18, "t_res.utils.process_wikipedia.title_to_id"]], "entitydisambiguation (class in t_res.utils.rel.entity_disambiguation)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation"]], "random_seed 
(t_res.utils.rel.entity_disambiguation attribute)": [[19, "t_res.utils.REL.entity_disambiguation.RANDOM_SEED"]], "get_data_items() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.get_data_items"]], "normalize_scores() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.normalize_scores"]], "predict() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.predict"]], "prerank() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.prerank"]], "train() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.train"]], "train_lr() (t_res.utils.rel.entity_disambiguation.entitydisambiguation method)": [[19, "t_res.utils.REL.entity_disambiguation.EntityDisambiguation.train_LR"]], "mulrelranker (class in t_res.utils.rel.mulrel_ranker)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker"]], "prerank (class in t_res.utils.rel.mulrel_ranker)": [[21, "t_res.utils.REL.mulrel_ranker.PreRank"]], "forward() (t_res.utils.rel.mulrel_ranker.mulrelranker method)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.forward"]], "forward() (t_res.utils.rel.mulrel_ranker.prerank method)": [[21, "t_res.utils.REL.mulrel_ranker.PreRank.forward"]], "loss() (t_res.utils.rel.mulrel_ranker.mulrelranker method)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.loss"]], "regularize() (t_res.utils.rel.mulrel_ranker.mulrelranker method)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.regularize"]], "training (t_res.utils.rel.mulrel_ranker.mulrelranker attribute)": [[21, "t_res.utils.REL.mulrel_ranker.MulRelRanker.training"]], "training (t_res.utils.rel.mulrel_ranker.prerank attribute)": 
[[21, "t_res.utils.REL.mulrel_ranker.PreRank.training"]], "vocabulary (class in t_res.utils.rel.vocabulary)": [[23, "t_res.utils.REL.vocabulary.Vocabulary"]], "add_to_vocab() (t_res.utils.rel.vocabulary.vocabulary method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.add_to_vocab"]], "get_id() (t_res.utils.rel.vocabulary.vocabulary method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.get_id"]], "normalize() (t_res.utils.rel.vocabulary.vocabulary static method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.normalize"]], "size() (t_res.utils.rel.vocabulary.vocabulary method)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.size"]], "unk_token (t_res.utils.rel.vocabulary.vocabulary attribute)": [[23, "t_res.utils.REL.vocabulary.Vocabulary.unk_token"]], "get_rel_from_api() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.get_rel_from_api"]], "match_ent() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.match_ent"]], "match_wikipedia_to_wikidata() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.match_wikipedia_to_wikidata"]], "postprocess_rel() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.postprocess_rel"]], "rel_end_to_end() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.rel_end_to_end"]], "run_rel_experiments() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.run_rel_experiments"]], "store_rel() (in module t_res.utils.rel_e2e)": [[24, "t_res.utils.rel_e2e.store_rel"]], "add_publication() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.add_publication"]], "eval_with_exception() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.eval_with_exception"]], "get_db_emb() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.get_db_emb"]], "prepare_initial_data() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.prepare_initial_data"]], "prepare_rel_trainset() (in module t_res.utils.rel_utils)": [[25, 
"t_res.utils.rel_utils.prepare_rel_trainset"]], "rank_candidates() (in module t_res.utils.rel_utils)": [[25, "t_res.utils.rel_utils.rank_candidates"]]}})
\ No newline at end of file