From 19726c4c06281e52fb31fa68feb08a695bf85b1c Mon Sep 17 00:00:00 2001
From: giyaseddin
Date: Sun, 10 Apr 2022 13:04:46 +0300
Subject: [PATCH 1/6] Add MedQuAD dataset loader

---
 biodatasets/medquad/medquad.py | 264 +++++++++++++++++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 biodatasets/medquad/medquad.py

diff --git a/biodatasets/medquad/medquad.py b/biodatasets/medquad/medquad.py
new file mode 100644
index 00000000..8b8d20a5
--- /dev/null
+++ b/biodatasets/medquad/medquad.py
@@ -0,0 +1,264 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This is an implementation of the dataloader for the MedQuAD dataset.
+MedQuAD includes 47,457 medical question-answer pairs created from 12 NIH websites (e.g. cancer.gov, niddk.nih.gov, GARD, MedlinePlus Health Topics). The collection covers 37 question types (e.g. Treatment, Diagnosis, Side Effects) associated with diseases, drugs and other medical entities such as tests.
+We included additional annotations in the XML files that can be used for diverse IR and NLP tasks, such as the question type, the question focus, its synonyms, its UMLS Concept Unique Identifier (CUI) and Semantic Type.
+We added the category of the question focus (Disease, Drug or Other) in the 4 MedlinePlus collections. All other collections are about diseases.
+The paper cited below describes the collection, the construction method as well as its use and evaluation within a medical question answering system.
+N.B. We removed the answers from 3 subsets to respect the MedlinePlus copyright (https://medlineplus.gov/copyright.html):
+(1) A.D.A.M. Medical Encyclopedia, (2) MedlinePlus Drug information, and (3) MedlinePlus Herbal medicine and supplement information.
+-- We kept all the other information, including the URLs, in case you want to crawl the answers. Please contact me if you have any questions.
+
+For more info please visit https://github.com/abachaa/MedQuAD/
+"""
+import json
+import os
+import xml.etree.ElementTree as ET
+from typing import List, Tuple, Dict
+
+import datasets
+import requests
+
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@article{BenAbacha-BMC-2019,
+    author = {Asma {Ben Abacha} and Dina Demner{-}Fushman},
+    title = {A Question-Entailment Approach to Question Answering},
+    journal = {{BMC} Bioinform.},
+    volume = {20},
+    pages = {511:1--511:23},
+    year = {2019},
+    url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3119-4},
+    doi = {https://doi.org/10.1186/s12859-019-3119-4},
+    biburl = {https://citation-needed.springer.com/v2/references/10.1186/s12859-019-3119-4?format=refman&flavour=citation},
+}
+"""
+
+_DATASETNAME = "medquad"
+
+_DESCRIPTION = """\
+MedQuAD: Medical Question Answering Dataset
+MedQuAD includes 47,457 medical question-answer pairs created from 12 NIH websites\
+(e.g. 
cancer.gov, niddk.nih.gov, GARD, MedlinePlus Health Topics).\ +The collection covers 37 question types (e.g. Treatment, Diagnosis, Side Effects) associated with diseases,\ +drugs and other medical entities such as tests. +""" + +_HOMEPAGE = "https://github.com/abachaa/MedQuAD" + +_LICENSE = "https://creativecommons.org/licenses/by/4.0/legalcode" # TODO: terms aren't available in the repository! In the issue it is 'CC BY 4.0' + +_DATA_PATH = "https://raw.githubusercontent.com/abachaa/MedQuAD/master" + +_DATA_REPO_FETCH_URL = "https://api.github.com/repos/abachaa/MedQuAD/git/trees/master?recursive=1" + +_SUBSET_BASE_URIS = { + "cancergov_qa": "1_CancerGov_QA", + "gard_qa": "2_GARD_QA", + "ghr_qa": "3_GHR_QA", + "mplus_health_topics_qa": "4_MPlus_Health_Topics_QA", + "niddk_qa": "5_NIDDK_QA", + "ninds_qa": "6_NINDS_QA", + "seniorhealth_qa": "7_SeniorHealth_QA", + "nhlbi_qa_xml": "8_NHLBI_QA_XML", + "cdc_qa": "9_CDC_QA", + "mplus_adam_qa": "10_MPlus_ADAM_QA", + "mplusdrugs_qa": "11_MPlusDrugs_QA", + "mplusherbssupplements_qa": "12_MPlusHerbsSupplements_QA", +} + +_URLS = { + "medquad_base_urls": _SUBSET_BASE_URIS, + f"QATestSetMedQrels_judged_answers": f"{_DATA_PATH}/QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip", +} + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] # TODO: shall we add a non-existing task type such as `RQE`? + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + + +class MedquadDataset(datasets.GeneratorBasedBuilder): + """MedQuAD: Medical Question Answering Dataset""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="medquad_source", + version=SOURCE_VERSION, + description="medquad source schema", + schema="source", + subset_id="medquad", + ), + BigBioConfig( + name="medquad_bigbio_qa", + version=BIGBIO_VERSION, + description="medquad BigBio schema", + schema="bigbio_qa", + subset_id="medquad", + ), + ] + + DEFAULT_CONFIG_NAME = "medquad_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "Document": datasets.Value("string"), + "QAPair": datasets.Value("string"), + "qid": datasets.Value("string"), + "qtype": datasets.Value("string"), + "Question": datasets.Value("string"), + "Answer": datasets.Value("string"), + } + ) + + elif self.config.schema == "bigbio_qa": + features = schemas.qa_features + else: + raise NotImplementedError("Only `source` and `bigbio_qa` schemas are implemented.") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _load_qa_from_xml(self, file_paths) -> Tuple[List[dict], str]: + """ + This method traverses the whole list of the downloaded XML files and extracts Q&A pairs. + Returns the extracted Q&As and the base directory of the dumped json file that contains them all. 
+        """
+        assert len(file_paths)
+
+        qa_list = []
+        for file_path in file_paths:
+            doc_root = ET.parse(file_path).getroot()
+            document_id = doc_root.attrib.get("id")
+            for element in doc_root:
+                if element.tag == "QAPairs":
+                    qa_pairs = element
+                    for qa_pair in qa_pairs:
+                        # Handled this way in case the Question and Answer elements appear in a different order
+                        question = qa_pair[0] if qa_pair[0].tag == "Question" else qa_pair[1]
+                        answer = qa_pair[1] if qa_pair[1].tag == "Answer" else qa_pair[0]
+
+                        qa_list.append({
+                            "Document": document_id,
+                            "QAPair": qa_pair.attrib.get("pid"),
+                            "qid": question.attrib.get("qid"),
+                            "qtype": question.attrib.get("qtype"),
+                            "Question": question.text,
+                            "Answer": answer.text,
+                        })
+
+        return qa_list, os.path.dirname(file_paths[0])
+
+    def _dump_xml_to_json(self, dl_manager) -> str:
+        """
+        Parses the dataset XML files and dumps the extracted Q&A pairs to a JSON file.
+
+        """
+        if self.config.subset_id == "medquad":
+            file_base_urls = _URLS[f"{self.config.subset_id}_base_urls"]
+            qa_pairs_enriched_fname = f"MedQuADGoldenEnriched/{self.config.subset_id}.json"
+            repo_files = json.loads(requests.get(_DATA_REPO_FETCH_URL).text)
+        else:
+            raise NotImplementedError("Only full set loader is implemented here")
+
+        # Collect path info for all repo paths, and determine relevant XML files
+        qa_file_paths = []
+        for subset_name, uri_ in file_base_urls.items():
+            for path in repo_files["tree"]:
+                if path["type"] == "blob" and path["path"].startswith(uri_) and path["path"].endswith(".xml"):
+                    qa_file_paths.append(os.path.join(_DATA_PATH, path["path"]))
+
+        qa_list, data_dir = self._load_qa_from_xml(
+            file_paths=dl_manager.download_and_extract(qa_file_paths)
+        )
+
+        qa_pairs_enriched_full_path = os.path.join(data_dir, qa_pairs_enriched_fname)
+
+        qa_pairs_enriched_dir = os.path.dirname(qa_pairs_enriched_full_path)
+        if not os.path.exists(qa_pairs_enriched_dir):
+            os.mkdir(qa_pairs_enriched_dir)
+
+        if not os.path.exists(qa_pairs_enriched_full_path):
+            data = {"qa_pairs": qa_list}
+            # dump QA pairs to json
+            with open(qa_pairs_enriched_full_path, "wt", encoding="utf-8") as file:
+                json.dump(data, file, indent=2)
+
+        return qa_pairs_enriched_full_path
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+
+        qa_pairs_enriched_fpath = self._dump_xml_to_json(dl_manager)
+
+        # There is no canonical train/valid/test set in this dataset. So, only TRAIN is added.
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": qa_pairs_enriched_fpath,
+                    "split": "train",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        if self.config.schema == "source":
+            with open(filepath, encoding="utf-8") as file:
+                data = json.load(file)
+                for key, record in enumerate(data["qa_pairs"]):
+                    yield key, {
+                        "Document": record["Document"],
+                        "QAPair": record["QAPair"],
+                        "qid": record["qid"],
+                        "qtype": record["qtype"],
+                        "Question": record["Question"],
+                        "Answer": record["Answer"],
+                    }
+
+        elif self.config.schema == "bigbio_qa":
+            with open(filepath, encoding="utf-8") as file:
+                uid = 0
+                data = json.load(file)
+                for key, record in enumerate(data["qa_pairs"]):
+                    uid += 1
+                    yield key, {
+                        "id": str(uid),
+                        "document_id": record["Document"],
+                        "question_id": record["qid"],
+                        "question": record["Question"],
+                        "type": record["qtype"],
+                        "choices": [],
+                        "context": [],
+                        "answer": [record["Answer"]],
+                    }

From 46e2068a9c714ebd1fe613b98d931e34eda11587 Mon Sep 17 00:00:00 2001
From: giyaseddin
Date: Thu, 14 Apr 2022 01:55:05 +0300
Subject: [PATCH 2/6] Download the repo archive instead of individual files

---
 biodatasets/medquad/medquad.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/biodatasets/medquad/medquad.py b/biodatasets/medquad/medquad.py
index 8b8d20a5..74305c41 100644
--- a/biodatasets/medquad/medquad.py
+++ b/biodatasets/medquad/medquad.py
@@ -27,11 +27,11 @@
 """
 import json
 import os
+import glob
 import xml.etree.ElementTree as ET
 from typing import List, Tuple, Dict
 
 import datasets
-import requests
 
 from utils import schemas
 from utils.configs import BigBioConfig
@@ -67,7 +67,7 @@
 
 _DATA_PATH = "https://raw.githubusercontent.com/abachaa/MedQuAD/master"
 
-_DATA_REPO_FETCH_URL = "https://api.github.com/repos/abachaa/MedQuAD/git/trees/master?recursive=1"
+_DATA_REPO_FETCH_URL = "https://github.com/abachaa/MedQuAD/archive/refs/heads/master.zip"
 
 _SUBSET_BASE_URIS = {
     "cancergov_qa": "1_CancerGov_QA",
@@ -182,22 +182,26 @@ def _dump_xml_to_json(self, dl_manager) -> str:
         Parses the dataset XML files and dumps the extracted Q&A pairs to a JSON file.
 
         """
+        repo_extracted = dl_manager.download_and_extract(_DATA_REPO_FETCH_URL)
+        repo_dir = os.path.join(
+            repo_extracted,
+            os.path.basename(_HOMEPAGE) + '-' + os.path.splitext(os.path.basename(_DATA_REPO_FETCH_URL))[0]
+        )
+
         if self.config.subset_id == "medquad":
             file_base_urls = _URLS[f"{self.config.subset_id}_base_urls"]
             qa_pairs_enriched_fname = f"MedQuADGoldenEnriched/{self.config.subset_id}.json"
-            repo_files = json.loads(requests.get(_DATA_REPO_FETCH_URL).text)
         else:
             raise NotImplementedError("Only full set loader is implemented here")
 
         # Collect path info for all repo paths, and determine relevant XML files
         qa_file_paths = []
         for subset_name, uri_ in file_base_urls.items():
-            for path in repo_files["tree"]:
-                if path["type"] == "blob" and path["path"].startswith(uri_) and path["path"].endswith(".xml"):
-                    qa_file_paths.append(os.path.join(_DATA_PATH, path["path"]))
+            for file_path in glob.glob(os.path.join(repo_dir, uri_, "*.xml")):
+                qa_file_paths.append(file_path)
 
         qa_list, data_dir = self._load_qa_from_xml(
-            file_paths=dl_manager.download_and_extract(qa_file_paths)
+            file_paths=qa_file_paths
         )
 
         qa_pairs_enriched_full_path = os.path.join(data_dir, qa_pairs_enriched_fname)

From 85ef3fd4ca8282765ab258ba73919c73f75adf6b Mon Sep 17 00:00:00 2001
From: giyaseddin
Date: Thu, 14 Apr 2022 02:33:29 +0300
Subject: [PATCH 3/6] 
Prevent unnecessary data extraction

---
 biodatasets/medquad/medquad.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/biodatasets/medquad/medquad.py b/biodatasets/medquad/medquad.py
index 74305c41..a207f887 100644
--- a/biodatasets/medquad/medquad.py
+++ b/biodatasets/medquad/medquad.py
@@ -67,6 +67,8 @@
 
 _DATA_PATH = "https://raw.githubusercontent.com/abachaa/MedQuAD/master"
 
+_TEST_DATA_PATH = "https://raw.githubusercontent.com/abachaa/LiveQA_MedicalTask_TREC2017/master/TestDataset/TREC-2017-LiveQA-Medical-Test.xml"
+
 _DATA_REPO_FETCH_URL = "https://github.com/abachaa/MedQuAD/archive/refs/heads/master.zip"
 
 _SUBSET_BASE_URIS = {
@@ -86,7 +88,7 @@
 
 _URLS = {
     "medquad_base_urls": _SUBSET_BASE_URIS,
-    f"QATestSetMedQrels_judged_answers": f"{_DATA_PATH}/QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip",
+    f"QATestSetMedQrels_judged_answers": _TEST_DATA_PATH,
 }
 
 _SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]  # TODO: shall we add a non-existing task type such as `RQE`?
@@ -175,7 +177,7 @@ def _load_qa_from_xml(self, file_paths) -> Tuple[List[dict], str]:
                             "Answer": answer.text,
                         })
 
-        return qa_list, os.path.dirname(file_paths[0])
+        return qa_list
 
     def _dump_xml_to_json(self, dl_manager) -> str:
         """
@@ -200,21 +202,22 @@ def _dump_xml_to_json(self, dl_manager) -> str:
             for file_path in glob.glob(os.path.join(repo_dir, uri_, "*.xml")):
                 qa_file_paths.append(file_path)
 
-        qa_list, data_dir = self._load_qa_from_xml(
-            file_paths=qa_file_paths
-        )
+        data_dir = os.path.dirname(qa_file_paths[0])
 
         qa_pairs_enriched_full_path = os.path.join(data_dir, qa_pairs_enriched_fname)
 
-        qa_pairs_enriched_dir = os.path.dirname(qa_pairs_enriched_full_path)
-        if not os.path.exists(qa_pairs_enriched_dir):
-            os.mkdir(qa_pairs_enriched_dir)
-
         if not os.path.exists(qa_pairs_enriched_full_path):
-            data = {"qa_pairs": qa_list}
+            qa_list = self._load_qa_from_xml(
+                file_paths=qa_file_paths
+            )
+
+            qa_pairs_enriched_dir = os.path.dirname(qa_pairs_enriched_full_path)
+            if not os.path.exists(qa_pairs_enriched_dir):
+                os.mkdir(qa_pairs_enriched_dir)
+
             # dump QA pairs to json
             with open(qa_pairs_enriched_full_path, "wt", encoding="utf-8") as file:
-                json.dump(data, file, indent=2)
+                json.dump(qa_list, file, indent=2)
 
         return qa_pairs_enriched_full_path

From d3545b702f2c52bf76ab81df9e2c5a31730657eb Mon Sep 17 00:00:00 2001
From: giyaseddin
Date: Thu, 14 Apr 2022 03:15:32 +0300
Subject: [PATCH 4/6] Add support for data subsets and a test split

---
 biodatasets/medquad/medquad.py | 64 +++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/biodatasets/medquad/medquad.py b/biodatasets/medquad/medquad.py
index a207f887..9ed49827 100644
--- a/biodatasets/medquad/medquad.py
+++ b/biodatasets/medquad/medquad.py
@@ -87,8 +87,8 @@
 }
 
 _URLS = {
-    "medquad_base_urls": _SUBSET_BASE_URIS,
-    f"QATestSetMedQrels_judged_answers": 
_TEST_DATA_PATH,
+    "medquad_base_uris": _SUBSET_BASE_URIS,
+    "medquad_test_base_uris": _TEST_DATA_PATH,
 }
 
 _SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]  # TODO: shall we add a non-existing task type such as `RQE`?
@@ -149,7 +149,7 @@ def _info(self) -> datasets.DatasetInfo:
         citation=_CITATION,
     )
 
-    def _load_qa_from_xml(self, file_paths) -> Tuple[List[dict], str]:
+    def _load_qa_from_xml(self, file_paths) -> List[Dict]:
         """
         This method traverses the whole list of the downloaded XML files and extracts Q&A pairs.
         Returns the extracted Q&As and the base directory of the dumped json file that contains them all.
@@ -179,29 +179,11 @@ def _load_qa_from_xml(self, file_paths) -> Tuple[List[dict], str]:
 
         return qa_list
 
-    def _dump_xml_to_json(self, dl_manager) -> str:
-        """
-        Parses the dataset XML files and dumps the extracted Q&A pairs to a JSON file.
-
-        """
-        repo_extracted = dl_manager.download_and_extract(_DATA_REPO_FETCH_URL)
-        repo_dir = os.path.join(
-            repo_extracted,
-            os.path.basename(_HOMEPAGE) + '-' + os.path.splitext(os.path.basename(_DATA_REPO_FETCH_URL))[0]
-        )
+    def _dump_xml_to_json(self, qa_file_paths, test=False):
 
-        if self.config.subset_id == "medquad":
-            file_base_urls = _URLS[f"{self.config.subset_id}_base_urls"]
-            qa_pairs_enriched_fname = f"MedQuADGoldenEnriched/{self.config.subset_id}.json"
-        else:
-            raise NotImplementedError("Only full set loader is implemented here")
+        qa_pairs_enriched_fname = f"MedQuADGoldenEnriched/{self.config.subset_id}.json"
 
         # Collect path info for all repo paths, and determine relevant XML files
-        qa_file_paths = []
-        for subset_name, uri_ in file_base_urls.items():
-            for file_path in glob.glob(os.path.join(repo_dir, uri_, "*.xml")):
-                qa_file_paths.append(file_path)
-
         data_dir = os.path.dirname(qa_file_paths[0])
 
         qa_pairs_enriched_full_path = os.path.join(data_dir, qa_pairs_enriched_fname)
@@ -189,11 +191,8 @@ def _dump_xml_to_json(self, dl_manager) -> str:
         if not os.path.exists(qa_pairs_enriched_full_path):
             qa_list = self._load_qa_from_xml(
                 file_paths=qa_file_paths
+            ) if test else self._load_qa_from_xml(
+                file_paths=qa_file_paths
             )
 
             qa_pairs_enriched_dir = os.path.dirname(qa_pairs_enriched_full_path)
@@ -221,10 +205,42 @@ def _dump_xml_to_json(self, dl_manager) -> str:
 
         return qa_pairs_enriched_full_path
 
+    def _dump_test_xml_to_json(self, dl_manager):
+        file_base_url = _URLS[f"medquad_test_base_uris"]
+        file_extracted = dl_manager.download_and_extract(file_base_url)
+
+        return self._dump_xml_to_json([file_extracted], test=True)
+
+    def _dump_train_xml_to_json(self, dl_manager) -> str:
+        """
+        Parses the training dataset, or a single subset that corresponds to one of the
+        source websites (see the repository page for the list of subsets). 
+        """
+        repo_extracted = dl_manager.download_and_extract(_DATA_REPO_FETCH_URL)
+        repo_dir = os.path.join(
+            repo_extracted,
+            os.path.basename(_HOMEPAGE) + '-' + os.path.splitext(os.path.basename(_DATA_REPO_FETCH_URL))[0]
+        )
+
+        if self.config.subset_id == "medquad":
+            file_base_urls = _URLS["medquad_base_uris"]
+        else:
+            file_base_urls = {self.config.subset_id: _SUBSET_BASE_URIS[self.config.subset_id]}
+
+        qa_file_paths = []
+        for subset_name, uri_ in file_base_urls.items():
+            for file_path in glob.glob(os.path.join(repo_dir, uri_, "*.xml")):
+                qa_file_paths.append(file_path)
+
+        return self._dump_xml_to_json(qa_file_paths)
+
+
 
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""
 
-        qa_pairs_enriched_fpath = self._dump_xml_to_json(dl_manager)
+        qa_pairs_enriched_fpath = self._dump_train_xml_to_json(dl_manager)
+        qa_pairs_enriched_test_fpath = self._dump_test_xml_to_json(dl_manager)
 
         # There is no canonical train/valid/test set in this dataset. So, only TRAIN is added.
         return [

From 37fd09454e5e673392a75f682cd08f44a8707a30 Mon Sep 17 00:00:00 2001
From: giyaseddin
Date: Tue, 17 May 2022 21:34:50 +0300
Subject: [PATCH 5/6] Update download style & add test dataset

---
 biodatasets/medquad/medquad.py | 142 +++++++++++++++++++++++++++------
 1 file changed, 115 insertions(+), 27 deletions(-)

diff --git a/biodatasets/medquad/medquad.py b/biodatasets/medquad/medquad.py
index 9ed49827..57fddf89 100644
--- a/biodatasets/medquad/medquad.py
+++ b/biodatasets/medquad/medquad.py
@@ -132,7 +132,7 @@ def _info(self) -> datasets.DatasetInfo:
                     "qid": datasets.Value("string"),
                     "qtype": datasets.Value("string"),
                     "Question": datasets.Value("string"),
-                    "Answer": datasets.Value("string"),
+                    "Answer": datasets.Sequence(datasets.Value("string")),
                 }
             )
 
@@ -149,7 +149,7 @@ def _info(self) -> datasets.DatasetInfo:
         citation=_CITATION,
     )
 
-    def _load_qa_from_xml(self, file_paths) -> List[Dict]:
+    def _load_train_qa_from_xml(self, file_paths) -> List[Dict]:
         """
         This method traverses the whole list of the downloaded XML files and extracts Q&A pairs.
         Returns the extracted Q&As and the base directory of the dumped json file that contains them all.
@@ -179,9 +179,78 @@ def _load_qa_from_xml(self, file_paths) -> List[Dict]:
 
         return qa_list
 
+    def _load_test_qa_from_xml(self, file_paths) -> List[Dict]:
+        """
+        This method traverses the downloaded test XML file and extracts Q&A pairs.
+        Returns the extracted Q&A records. 
+        """
+        assert len(file_paths)
+
+        qa_list = []
+        for file_path in file_paths:
+            doc_root = ET.parse(file_path).getroot()
+            for nlm_question in doc_root:
+                qid = nlm_question.attrib.get("qid")
+
+                original_question = nlm_question[0]
+                original_question_qfile = original_question.attrib.get("qfile")
+                original_question_subject = original_question[0]
+                original_question_message = original_question[1]
+                nist_paraphrase = nlm_question[1]
+                annotations = nlm_question[2]
+                annotation_focuses, annotation_types, annotation_keywords = [], [], []
+                for annotation in annotations:
+                    if annotation.tag == "FOCUS":
+                        annotation_focuses.append({
+                            "fid": annotation.attrib.get("fid"),
+                            "fcategory": annotation.attrib.get("fcategory"),
+                            "text": annotation.text,
+                        })
+                    elif annotation.tag == "TYPE":
+                        annotation_types.append({
+                            "tid": annotation.attrib.get("tid"),
+                            "hasFocus": annotation.attrib.get("hasFocus"),
+                            "hasKeyword": annotation.attrib.get("hasKeyword"),
+                            "text": annotation.text,
+                        })
+                    elif annotation.tag == "KEYWORD":
+                        annotation_keywords.append({
+                            "kid": annotation.attrib.get("kid"),
+                            "kcategory": annotation.attrib.get("kcategory"),
+                            "text": annotation.text,
+                        })
+
+                reference_answers = nlm_question[3]
+                ref_answers = []
+                for ref_answer in reference_answers:
+                    ref_answers.append({
+                        "aid": ref_answer.attrib.get("aid"),
+                        "ANSWER": ref_answer[0].text,
+                        "AnswerURL": ref_answer[1].text,
+                        "COMMENT": ref_answer[2].text,
+                    })
+
+                qa_list.append({
+                    "qid": qid,
+                    "Original-Question": {
+                        "qfile": original_question_qfile,
+                        "SUBJECT": original_question_subject.text,
+                        "MESSAGE": original_question_message.text,
+                    },
+                    "NIST-PARAPHRASE": nist_paraphrase.text,
+                    "ANNOTATIONS": {
+                        "FOCUS": annotation_focuses,
+                        "TYPE": annotation_types,
+                        "KEYWORD": annotation_keywords,
+                    },
+                    "ReferenceAnswers": ref_answers,
+                })
+
+        return qa_list
+
-    def _dump_xml_to_json(self, qa_file_paths, test=False):
+    def _dump_xml_to_json(self, qa_file_paths, split):
 
-        qa_pairs_enriched_fname = f"MedQuADGoldenEnriched/{self.config.subset_id}.json"
+        qa_pairs_enriched_fname = f"MedQuADGoldenEnriched/{self.config.subset_id}_{split}.json"
 
         # Collect path info for all repo paths, and determine relevant XML files
         data_dir = os.path.dirname(qa_file_paths[0])
 
         qa_pairs_enriched_full_path = os.path.join(data_dir, qa_pairs_enriched_fname)
 
         if not os.path.exists(qa_pairs_enriched_full_path):
-            qa_list = self._load_qa_from_xml(
-                file_paths=qa_file_paths
-            ) if test else self._load_qa_from_xml(
-                file_paths=qa_file_paths
-            )
+            if split == datasets.Split.TEST:
+                qa_list = self._load_test_qa_from_xml(
+                    file_paths=qa_file_paths
+                )
+            else:
+                qa_list = self._load_train_qa_from_xml(
+                    file_paths=qa_file_paths
+                )
 
             qa_pairs_enriched_dir = os.path.dirname(qa_pairs_enriched_full_path)
             if not os.path.exists(qa_pairs_enriched_dir):
                 os.mkdir(qa_pairs_enriched_dir)
 
             # dump QA pairs to json
             with open(qa_pairs_enriched_full_path, "wt", encoding="utf-8") as file:
-                json.dump(qa_list, file, indent=2)
+                json.dump(qa_list, file)
 
         return qa_pairs_enriched_full_path
 
     def _dump_test_xml_to_json(self, dl_manager):
-        file_base_url = _URLS[f"medquad_test_base_uris"]
+        file_base_url = _URLS["medquad_test_base_uris"]
         file_extracted = dl_manager.download_and_extract(file_base_url)
 
-        return self._dump_xml_to_json([file_extracted], test=True)
+        return self._dump_xml_to_json([file_extracted], split=datasets.Split.TEST)
 
     def _dump_train_xml_to_json(self, dl_manager) -> str:
         """
@@ -232,14 +304,12 @@ def 
_dump_train_xml_to_json(self, dl_manager) -> str:
         for subset_name, uri_ in file_base_urls.items():
             for file_path in glob.glob(os.path.join(repo_dir, uri_, "*.xml")):
                 qa_file_paths.append(file_path)
 
-        return self._dump_xml_to_json(qa_file_paths)
-
-
+        return self._dump_xml_to_json(qa_file_paths, split=datasets.Split.TRAIN)
 
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""
 
-        qa_pairs_enriched_fpath = self._dump_train_xml_to_json(dl_manager)
+        qa_pairs_enriched_train_fpath = self._dump_train_xml_to_json(dl_manager)
         qa_pairs_enriched_test_fpath = self._dump_test_xml_to_json(dl_manager)
 
-        # There is no canonical train/valid/test set in this dataset. So, only TRAIN is added.
+        # TRAIN comes from MedQuAD; TEST comes from the judged TREC 2017 LiveQA medical test set.
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "filepath": qa_pairs_enriched_fpath,
-                    "split": "train",
+                    "filepath": qa_pairs_enriched_train_fpath,
+                    "split": datasets.Split.TRAIN,
                 },
             ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": qa_pairs_enriched_test_fpath,
+                    "split": datasets.Split.TEST,
+                },
+            ),
         ]
 
     def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
 
         if self.config.schema == "source":
             with open(filepath, encoding="utf-8") as file:
                 data = json.load(file)
                 for key, record in enumerate(data):
-                    yield key, {
-                        "Document": record["Document"],
-                        "QAPair": record["QAPair"],
-                        "qid": record["qid"],
-                        "qtype": record["qtype"],
-                        "Question": record["Question"],
-                        "Answer": record["Answer"],
-                    }
+                    if split == datasets.Split.TEST:
+                        yield key, {
+                            "Document": None,
+                            "QAPair": None,
+                            "qid": record["qid"],
+                            "qtype": record["Original-Question"]["SUBJECT"],
+                            # A paraphrased version of record["Original-Question"]["MESSAGE"]
+                            "Question": record["NIST-PARAPHRASE"],
+                            "Answer": [ref["ANSWER"] for ref in record["ReferenceAnswers"]],
+                        }
+                    else:
+                        yield key, {
+                            "Document": record["Document"],
+                            "QAPair": record["QAPair"],
+                            "qid": record["qid"],
+                            "qtype": record["qtype"],
+                            "Question": record["Question"],
+                            "Answer": [record["Answer"]],
+                        }

From 8785975ab5dded129f4b0072b338481f886c2d66 Mon Sep 17 00:00:00 2001
From: giyaseddin
Date: Wed, 18 May 2022 17:44:16 +0300
Subject: [PATCH 6/6] Fix bigbio_qa format

---
 biodatasets/medquad/medquad.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/biodatasets/medquad/medquad.py b/biodatasets/medquad/medquad.py
index 57fddf89..bfd521b6 100644
--- a/biodatasets/medquad/medquad.py
+++ b/biodatasets/medquad/medquad.py
@@ -333,9 +333,9 @@ def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
 
-        if self.config.schema == "source":
-            with open(filepath, encoding="utf-8") as file:
-                data = json.load(file)
+        with open(filepath, encoding="utf-8") as file:
+            data = json.load(file)
+        if self.config.schema == "source":
             for key, record in enumerate(data):
                 if split == datasets.Split.TEST:
                     yield key, {
                         "Document": None,
                         "QAPair": None,
                         "qid": record["qid"],
                         "qtype": record["Original-Question"]["SUBJECT"],
                         # A paraphrased version of record["Original-Question"]["MESSAGE"]
                         "Question": record["NIST-PARAPHRASE"],
                         "Answer": [ref["ANSWER"] for ref in record["ReferenceAnswers"]],
                     }
                 else:
                     yield key, {
                         "Document": record["Document"],
                         "QAPair": record["QAPair"],
                         "qid": record["qid"],
                         "qtype": record["qtype"],
                         "Question": record["Question"],
                         "Answer": [record["Answer"]],
                     }
 
-        elif self.config.schema == "bigbio_qa":
-            with open(filepath, encoding="utf-8") as file:
+        
elif self.config.schema == "bigbio_qa":
             uid = 0
-            data = json.load(file)
             for key, record in enumerate(data):
                 uid += 1
                 yield key, {
                     "id": str(uid),
-                    "document_id": record["Document"],
+                    "document_id": record.get("Document"),
                     "question_id": record["qid"],
-                    "question": record["Question"],
-                    "type": record["qtype"],
+                    "question": record.get("Question") or record.get("NIST-PARAPHRASE"),
+                    "type": record.get("qtype") or record.get("Original-Question", {}).get("SUBJECT"),
                     "choices": [],
                     "context": [],
-                    "answer": [record["Answer"]],
+                    "answer": [record["Answer"]] if "Answer" in record else [ref["ANSWER"] for ref in record["ReferenceAnswers"]],
                 }
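
A minimal usage sketch of the finished loader (assuming a local checkout of the bigbio repo, run from the repository root so that the `utils` imports in the script resolve; the config names come from `BUILDER_CONFIGS` above, and the exact entry point may differ in the BigBio harness):

    from datasets import load_dataset

    # Source schema: raw MedQuAD/LiveQA fields
    # (Document, QAPair, qid, qtype, Question, Answer)
    source = load_dataset("biodatasets/medquad/medquad.py", name="medquad_source")
    print(source["train"][0]["Question"])

    # BigBio QA schema: harmonized fields
    # (id, document_id, question_id, question, type, choices, context, answer)
    bigbio = load_dataset("biodatasets/medquad/medquad.py", name="medquad_bigbio_qa")
    print(bigbio["test"][0]["answer"])

With the patches above applied, both configs yield a "train" split built from the MedQuAD XML subsets and a "test" split built from the TREC 2017 LiveQA medical test file.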