bigscience-workshop · ruisi-su · May 24, 2022 · May 24, 2022 · May 25, 2022 · May 25, 2022
@@ -0,0 +1,60 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+# See https://github.com/crmne/cookiecutter-modern-datascience
+fail_fast: true  # stop running hooks after the first failure
+exclude: '^$'  # matches only the empty string, so no file is excluded
+files: ^bigbio/biodatasets/  # global include pattern applied to every hook
+repos:
+    # Generic hygiene checks.
+    -   repo: https://github.com/pre-commit/pre-commit-hooks
+        rev: v4.3.0
+        hooks:
+        -   id: trailing-whitespace
+        -   id: end-of-file-fixer
+        -   id: check-yaml
+        -   id: check-case-conflict
+        -   id: debug-statements
+        -   id: detect-private-key
+        -   id: check-merge-conflict
+        -   id: check-added-large-files
+    # -   repo: https://github.com/myint/autoflake
+    #     rev: v1.7.6
+    #     hooks:
+    #     -   id: autoflake
+    #         args:
+    #           - --in-place
+    #           - --remove-duplicate-keys
+    #           - --remove-unused-variables
+    #           - --remove-all-unused-imports
+    #           - --expand-star-imports
+    -   repo: https://github.com/PyCQA/flake8
+        rev: 5.0.4
+        hooks:
+        -   id: flake8
+            args:
+                - --max-line-length
+                - '119'  # same limit as black's --line-length
+    -   repo: https://github.com/PyCQA/isort
+        rev: 5.10.1
+        hooks:
+        -   id: isort
+            args:
+                - --profile
+                - black  # import layout compatible with black's formatting
+    -   repo: https://github.com/psf/black  # canonical location (formerly ambv/black)
+        rev: 22.10.0
+        hooks:
+        -   id: black
+            args:
+                - --line-length
+                - '119'  # same limit as flake8's --max-line-length
+                - --target-version
+                - py38
+    -   repo: local
+        hooks:
+        -   id: test-bigbio
+            name: running bigbio unit tests
+            entry: python -m tests.test_bigbio
+            language: system  # runs with the interpreter on PATH; no env is created
+            files: ^bigbio/biodatasets/
+            pass_filenames: true  # matching staged files are appended to entry
+            # always_run: true
@@ -54,11 +54,11 @@
 _DISPLAYNAME = "AnEM"
 
 _DESCRIPTION = """\
-AnEM corpus is a domain- and species-independent resource manually annotated for anatomical
-entity mentions using a fine-grained classification system. The corpus consists of 500 documents
-(over 90,000 words) selected randomly from citation abstracts and full-text papers with
-the aim of making the corpus representative of the entire available biomedical scientific
-literature. The corpus annotation covers mentions of both healthy and pathological anatomical
+AnEM corpus is a domain- and species-independent resource manually annotated for anatomical \
+entity mentions using a fine-grained classification system. The corpus consists of 500 documents \
+(over 90,000 words) selected randomly from citation abstracts and full-text papers with \
+the aim of making the corpus representative of the entire available biomedical scientific \
+literature. The corpus annotation covers mentions of both healthy and pathological anatomical \
 entities and contains over 3,000 annotated mentions.
 """
 
@@ -167,10 +167,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
                     "filepath": all_data,
-                    "split_path": data_dir
-                    / "AnEM-1.0.4"
-                    / "development"
-                    / "train-files.list",
+                    "split_path": data_dir / "AnEM-1.0.4" / "development" / "train-files.list",
                     "split": "train",
                 },
             ),
@@ -186,10 +183,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
                     "filepath": all_data,
-                    "split_path": data_dir
-                    / "AnEM-1.0.4"
-                    / "development"
-                    / "test-files.list",
+                    "split_path": data_dir / "AnEM-1.0.4" / "development" / "test-files.list",
                     "split": "dev",
                 },
             ),
@@ -251,10 +245,7 @@ def _brat_to_source(self, filepath, brat_example):
             "equivalences": [
                 {
                     "entity_id": brat_entity["id"],
-                    "ref_ids": [
-                        f"{brat_example['document_id']}_{ids}"
-                        for ids in brat_entity["ref_ids"]
-                    ],
+                    "ref_ids": [f"{brat_example['document_id']}_{ids}" for ids in brat_entity["ref_ids"]],
                 }
                 for brat_entity in brat_example["equivalences"]
             ],

@@ -235,7 +235,7 @@ def gen_latex(dataset_name, helper, splits, schemas, fig_path):
         r"Token frequency distribution by split (top) and frequency of different kind of instances (bottom).}"
         + "\n"
     )
-    latex_bod += r"\end{figure}" + "\n" + r"\textbf{Dataset Description} "
+    latex_bod += r"\end{figure}" + "\n" + r"\textbf{Dataset Description:} "
     latex_bod += (
         fr"{descriptions}"
         + "\n"
@@ -403,4 +403,3 @@ def draw_figure(data_name, data_config_name, schema_type):
         latex_name = f"{data_name}_{config_name}.tex"
         write_latex(latex_bod, latex_name)
         print(latex_bod)
-