diff --git a/experiments/experiment.py b/experiments/experiment.py index 2e758586..00e362b4 100644 --- a/experiments/experiment.py +++ b/experiments/experiment.py @@ -743,7 +743,10 @@ def linking_experiments(self) -> None: self.mylinker.linking_resources["mentions_to_wikidata"], ) if self.mylinker.rel_params["with_publication"]: - # If "publ", add an artificial publication entry: + # If "publ", add publication info to context and as new entry: + article_dataset = rel_utils.add_publication_in_context( + article_dataset + ) article_dataset = rel_utils.add_publication(article_dataset) predicted = linking_model.predict(article_dataset) if self.mylinker.rel_params["with_publication"]: diff --git a/geoparser/pipeline.py b/geoparser/pipeline.py index 4c661214..68601fd3 100644 --- a/geoparser/pipeline.py +++ b/geoparser/pipeline.py @@ -257,7 +257,15 @@ def run_sentence( mentions_dataset = dict() mentions_dataset["linking"] = [] for m in mentions: - prediction = self.format_prediction(m, sentence, wk_cands=wk_cands, context=context, sent_idx=sent_idx, place=place, place_wqid=place_wqid) + prediction = self.format_prediction( + m, + sentence, + wk_cands=wk_cands, + context=context, + sent_idx=sent_idx, + place=place, + place_wqid=place_wqid, + ) mentions_dataset["linking"].append(prediction) # If the linking method is "reldisamb", rank and format candidates, @@ -274,6 +282,12 @@ def run_sentence( place_wqid = self.mylinker.rel_params["default_publwqid"] place = self.mylinker.rel_params["default_publname"] + # If "publ", add the place of publication to the context: + mentions_dataset = rel_utils.add_publication_in_context( + mentions_dataset, + place, + ) + # If "publ", add an artificial publication entry: mentions_dataset = rel_utils.add_publication( mentions_dataset, @@ -508,11 +522,7 @@ def run_text( return document_dataset - - def run_sentence_recognition( - self, - sentence - ) -> List[dict]: + def run_sentence_recognition(self, sentence) -> List[dict]: # Get 
predictions: predictions = self.myner.ner_predict(sentence) @@ -525,15 +535,16 @@ def run_sentence_recognition( # Aggregate mentions: mentions = ner.aggregate_mentions(procpreds, "pred") return mentions - - - def format_prediction(self, mention, - sentence: str, - wk_cands: Optional[dict] = None, - context: Optional[Tuple[str, str]] = ("", ""), - sent_idx: Optional[int] = 0, - place: Optional[str] = "", - place_wqid: Optional[str] = "" + + def format_prediction( + self, + mention, + sentence: str, + wk_cands: Optional[dict] = None, + context: Optional[Tuple[str, str]] = ("", ""), + sent_idx: Optional[int] = 0, + place: Optional[str] = "", + place_wqid: Optional[str] = "", ) -> dict: prediction = dict() prediction["mention"] = mention["mention"] @@ -551,12 +562,12 @@ def format_prediction(self, mention, prediction["place"] = place prediction["place_wqid"] = place_wqid if wk_cands: - prediction["string_match_candidates"] = wk_cands.get(mention["mention"], dict()) + prediction["string_match_candidates"] = wk_cands.get( + mention["mention"], dict() + ) prediction["candidates"] = wk_cands.get(mention["mention"], dict()) return prediction - - def run_text_recognition( self, text: str, @@ -627,7 +638,15 @@ def run_text_recognition( mentions_dataset = [] for m in mentions: - prediction = self.format_prediction(m, sentence, wk_cands=None, context=context, sent_idx=idx, place=place, place_wqid=place_wqid) + prediction = self.format_prediction( + m, + sentence, + wk_cands=None, + context=context, + sent_idx=idx, + place=place, + place_wqid=place_wqid, + ) # mentions_dataset["linking"].append(prediction) if not len(m["mention"]) == 1 and not m["mention"].islower(): mentions_dataset.append(prediction) @@ -777,6 +796,12 @@ def run_disambiguation( place_wqid = self.mylinker.rel_params["default_publwqid"] place = self.mylinker.rel_params["default_publname"] + # If "publ", add the place of publication to the context: + mentions_dataset = rel_utils.add_publication_in_context( + 
mentions_dataset, + place, + ) + # If "publ", add an artificial publication entry: mentions_dataset = rel_utils.add_publication( mentions_dataset, diff --git a/resources/publication_metadata.json b/resources/publication_metadata.json index a9daace2..0993c9f1 100644 --- a/resources/publication_metadata.json +++ b/resources/publication_metadata.json @@ -1,162 +1,139 @@ { "sn83030483": { "publication_title": "Gazette of the United-States", - "publication_place": "New York", - "publication_ctxt": "New York", + "publication_place": "New York, New York", "publication_dates": "1789-1793", "wikidata_qid": "Q60" }, "sn84026272": { "publication_title": "Gazette of the United-States", - "publication_place": "Philadelphia", - "publication_ctxt": "Pennsylvania", + "publication_place": "Philadelphia, Pennsylvania", "publication_dates": "1800-1801", "wikidata_qid": "Q1345" }, "sn82014385": { "publication_title": "The Delaware gazette", - "publication_place": "Wilmington", - "publication_ctxt": "Delaware", + "publication_place": "Wilmington, Delaware", "publication_dates": "1809-1810", "wikidata_qid": "Q174224" }, "sn83026170": { "publication_title": "Alexandria Gazette", - "publication_place": "Alexandria", - "publication_ctxt": "Virginia", + "publication_place": "Alexandria, Virginia", "publication_dates": "1817-1822", "wikidata_qid": "Q88" }, "sn83020874": { "publication_title": "Cherokee Phoenix, and Indian's advocate", - "publication_place": "Echota", - "publication_ctxt": "Georgia", + "publication_place": "Echota, Georgia", "publication_dates": "1829-1834", "wikidata_qid": "Q7007061" }, "sn84020750": { "publication_title": "The North Carolinian", - "publication_place": "Fayetteville", - "publication_ctxt": "North Carolina", + "publication_place": "Fayetteville, North Carolina", "publication_dates": "1839-1861", "wikidata_qid": "Q331104" }, "sn85042404": { "publication_title": "Jamestown Alert", - "publication_place": "Jamestown", - "publication_ctxt": "North Dakota", + 
"publication_place": "Jamestown, North Dakota", "publication_dates": "1878-1882", "wikidata_qid": "Q1052658" }, "sn88068010": { "publication_title": "Chariton Courier", - "publication_place": "Keytesville", - "publication_ctxt": "Missouri", + "publication_place": "Keytesville, Missouri", "publication_dates": "1878-current", "wikidata_qid": "Q957297" }, "sn86063397": { "publication_title": "The Elk Mountain pilot", - "publication_place": "Irwin", - "publication_ctxt": "Colorado", + "publication_place": "Irwin, Colorado", "publication_dates": "1880-19??", "wikidata_qid": "Q592729" }, "sn88085488": { "publication_title": "Pullman Herald", - "publication_place": "Pullman", - "publication_ctxt": "Washington", + "publication_place": "Pullman, Washington", "publication_dates": "1888-1989", "wikidata_qid": "Q983540" }, "sn89058133": { "publication_title": "Putnam County Herald", - "publication_place": "Cookeville", - "publication_ctxt": "Tennessee", + "publication_place": "Cookeville, Tennessee", "publication_dates": "1903-1922", "wikidata_qid": "Q2456192" }, "sn83025812": { "publication_title": "The Independent", - "publication_place": "Elizabeth City", - "publication_ctxt": "North Carolina", + "publication_place": "Elizabeth City, North Carolina", "publication_dates": "1908-1936", "wikidata_qid": "Q1018467" }, "sn92063852": { "publication_title": "The Detroit Tribune", - "publication_place": "Detroit", - "publication_ctxt": "Michigan", + "publication_place": "Detroit, Michigan", "publication_dates": "1935-1966", "wikidata_qid": "Q12439" }, "sn91068761": { "publication_title": "Tabor City Tribune", - "publication_place": "Tabor City", - "publication_ctxt": "North Carolina", + "publication_place": "Tabor City, North Carolina", "publication_dates": "1946-1991", "wikidata_qid": "Q586130" }, "0000408": { "publication_title": "Dorset County Chronicle", - "publication_place": "Dorchester", - "publication_ctxt": "Dorset", + "publication_place": "Dorchester, Dorset", 
"publication_dates": "1824-1884", "wikidata_qid": "Q503331" }, "0000206": { "publication_title": "Manchester Courier and Lancashire General Advertiser.", - "publication_place": "Manchester", - "publication_ctxt": "Lancashire", + "publication_place": "Manchester, Lancashire", "publication_dates": "1825-1916", "wikidata_qid": "Q18125" }, "0000968": { "publication_title": "The Ashton Weekly Reporter, and Stalybridge and Dukinfield Chronicle", - "publication_place": "Ashton-under-Lyne", - "publication_ctxt": "Lancashire", + "publication_place": "Ashton-under-Lyne, Lancashire", "publication_dates": "1855-", "wikidata_qid": "Q659803" }, "0000200": { "publication_title": "The Manchester Mercury", - "publication_place": "Manchester", - "publication_ctxt": "Lancashire", + "publication_place": "Manchester, Lancashire", "publication_dates": "1752-1830", "wikidata_qid": "Q18125" }, "0000201": { "publication_title": "The Manchester Mercury", - "publication_place": "Manchester", - "publication_ctxt": "Lancashire", + "publication_place": "Manchester, Lancashire", "publication_dates": "1752-1830", "wikidata_qid": "Q18125" }, "0000239": { "publication_title": "The Manchester Mercury", - "publication_place": "Manchester", - "publication_ctxt": "Lancashire", + "publication_place": "Manchester, Lancashire", "publication_dates": "1752-1830", "wikidata_qid": "Q18125" }, "0000240": { "publication_title": "The Manchester Mercury", - "publication_place": "Manchester", - "publication_ctxt": "Lancashire", + "publication_place": "Manchester, Lancashire", "publication_dates": "1752-1830", "wikidata_qid": "Q18125" }, "0000967": { "publication_title": "Ashton and Stalybridge Reporter", - "publication_place": "Ashton-under-Lyne", - "publication_ctxt": "Lancashire", + "publication_place": "Ashton-under-Lyne, Lancashire", "publication_dates": "1855-", "wikidata_qid": "Q659803" }, "0002325": { "publication_title": "The Poole and South-Western Herald", - "publication_place": "Poole", - 
def add_publication_in_context(rel_json: dict, publname: Optional[str] = "") -> dict:
    """
    Append the place of publication to the right-hand context of each mention.

    For every article in ``rel_json``, a single space followed by the place of
    publication is appended to the second element of each mention's
    ``"context"`` entry. The input is left untouched: the returned dictionary
    holds copies of the mention dictionaries with freshly built contexts.

    Arguments:
        rel_json (dict): The JSON data containing articles and mention
            information. Each key maps to a list of mention dictionaries,
            and each mention must have a ``"context"`` entry that is an
            indexable pair (list or tuple) of strings.
        publname (str, optional): The name of the place of publication.
            Always used for the ``"linking"`` key; for any other key it is
            only the fallback when the first mention of the article has no
            ``"place"`` entry. Defaults to an empty string.

    Returns:
        dict: A new JSON dictionary with the place of publication appended
        to the context of every mention. The container type of each
        ``"context"`` (list or tuple) is preserved.

    Raises:
        KeyError: If a mention has no ``"context"`` entry.
    """
    new_json = rel_json.copy()
    for article, mentions in rel_json.items():
        place = publname
        # Articles other than the ad-hoc "linking" entry carry their own place
        # of publication on their mentions; the first mention is taken as
        # representative. Guard against articles without any mention.
        if article != "linking" and mentions:
            place = mentions[0].get("place", publname)

        new_article = []
        for art_mention in mentions:
            original_context = art_mention["context"]
            # Work on a list copy: this supports immutable (tuple) contexts
            # and avoids mutating the caller's data in place.
            updated_context = list(original_context)
            updated_context[1] += " " + place
            if isinstance(original_context, tuple):
                updated_context = tuple(updated_context)

            # Shallow-copy the mention so only the returned structure carries
            # the extended context.
            new_mention = art_mention.copy()
            new_mention["context"] = updated_context
            new_article.append(new_mention)
        new_json[article] = new_article
    return new_json