diff --git a/ac_dc/anonymization.py b/ac_dc/anonymization.py
index af61230f..e5fed667 100644
--- a/ac_dc/anonymization.py
+++ b/ac_dc/anonymization.py
@@ -30,7 +30,7 @@ def apply_regex_anonymization(
         tag_type=tag_type,
     )
     if anonymize_condition:
-        for (ent, start, end, tag) in ner:
+        for ent, start, end, tag in ner:
             # we need to actually walk through and replace by start, end span.
             sentence = sentence.replace(ent, f" <{tag}> ")
     return sentence, ner
diff --git a/ac_dc/deduplicate/self_deduplicate.py b/ac_dc/deduplicate/self_deduplicate.py
index 74cf88b2..23eb1056 100644
--- a/ac_dc/deduplicate/self_deduplicate.py
+++ b/ac_dc/deduplicate/self_deduplicate.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # @Date    : 2022-01-08 22:39:29
 # @Author  : Chenghao Mou (mouchenghao@gmail.com)
 # @Description: Self-deduplication with `datasets`
@@ -27,8 +26,7 @@


 def main(conf: str) -> None:
-
-    with open(conf, "r") as f:
+    with open(conf) as f:
         conf = yaml.safe_load(f.read())

     if conf["load_from_disk"]["path"]:
@@ -201,5 +199,4 @@ def main(conf: str) -> None:


 if __name__ == "__main__":
-
     typer.run(main)
diff --git a/ac_dc/visualization/get_data_for_visualization.py b/ac_dc/visualization/get_data_for_visualization.py
index 55c241f7..a07a015a 100644
--- a/ac_dc/visualization/get_data_for_visualization.py
+++ b/ac_dc/visualization/get_data_for_visualization.py
@@ -21,7 +21,6 @@ def __init__(
         path_kenlm_model,
         path_save_stats,
     ):
-
         self.ds = dataset
         self.num_iter = num_iter

@@ -166,7 +165,6 @@ def compute_stats(self):


 if __name__ == "__main__":
-
     lang_dataset_id = "en"

     dataset_name = "oscar"  # "TurkuNLP/register_oscar"
diff --git a/ac_dc/visualization/visualization.py b/ac_dc/visualization/visualization.py
index 3e532aa4..4c0315b1 100644
--- a/ac_dc/visualization/visualization.py
+++ b/ac_dc/visualization/visualization.py
@@ -625,7 +625,6 @@ def filtering_of_words(self):
             )

         if display_discarded_words_by_filter:
-
             if "len_word" in columns:
                 cond_filter = np.invert(conds_words["len_word"])
                 Visualization_for_lang.display_dataset(
@@ -698,7 +697,6 @@ def is_doc_discarded(key, score):
                 return score < key[1]

         if personal_doc:
-
             st.markdown("Statistics of the document:")

             for key in self.keys:
diff --git a/bertin/evaluation/run_glue.py b/bertin/evaluation/run_glue.py
index a08cba25..93ebb264 100644
--- a/bertin/evaluation/run_glue.py
+++ b/bertin/evaluation/run_glue.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/bertin/evaluation/run_ner.py b/bertin/evaluation/run_ner.py
index dbd9cd9a..d6139f13 100644
--- a/bertin/evaluation/run_ner.py
+++ b/bertin/evaluation/run_ner.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/bertin/mc4/mc4.py b/bertin/mc4/mc4.py
index 923e5e20..5f915d4f 100644
--- a/bertin/mc4/mc4.py
+++ b/bertin/mc4/mc4.py
@@ -404,7 +404,7 @@ def _generate_examples(self, filepaths):
         for filepath in filepaths:
             logger.info("generating examples from = %s", filepath)
             if filepath.endswith("jsonl"):
-                with open(filepath, "r", encoding="utf-8") as f:
+                with open(filepath, encoding="utf-8") as f:
                     for line in f:
                         if line:
                             example = json.loads(line)
diff --git a/bertin/run_mlm_flax.py b/bertin/run_mlm_flax.py
index 54251b94..a0bea219 100644
--- a/bertin/run_mlm_flax.py
+++ b/bertin/run_mlm_flax.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/bertin/run_mlm_flax_stream.py b/bertin/run_mlm_flax_stream.py
index a33eaae1..1335bcf8 100644
--- a/bertin/run_mlm_flax_stream.py
+++ b/bertin/run_mlm_flax_stream.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -446,7 +445,7 @@ def restore_checkpoint(save_dir, state):
     args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
     data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))

-    with open(os.path.join(save_dir, "training_state.json"), "r") as f:
+    with open(os.path.join(save_dir, "training_state.json")) as f:
         training_state = json.load(f)
     step = training_state["step"]

diff --git a/bertin/utils/dataset_perplexity.py b/bertin/utils/dataset_perplexity.py
index 2ca470c8..ecf02308 100644
--- a/bertin/utils/dataset_perplexity.py
+++ b/bertin/utils/dataset_perplexity.py
@@ -17,7 +17,7 @@ def get_perplexity(doc):


 with open("mc4-es-train-50M-stats.csv", "w") as csv:
-    with open("mc4-es-train-50M-steps.jsonl", "r") as data:
+    with open("mc4-es-train-50M-steps.jsonl") as data:
         for line in tqdm(data):
             text = json.loads(line)["text"]
             csv.write(f"{len(text.split())},{get_perplexity(text)}\n")
diff --git a/cc_pseudo_crawl/python_scripts/download_warc.py b/cc_pseudo_crawl/python_scripts/download_warc.py
index a5699c76..0ba1bde1 100644
--- a/cc_pseudo_crawl/python_scripts/download_warc.py
+++ b/cc_pseudo_crawl/python_scripts/download_warc.py
@@ -143,9 +143,9 @@ def get_warcs(batch):
         existing_compressed_warcs,
     )

-    batch["compressed_warc"], batch["download_exception"] = [
+    batch["compressed_warc"], batch["download_exception"] = (
         list(l) for l in zip(*warcs_or_exceptions)
-    ]
+    )

     return batch

diff --git a/cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py b/cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py
index dd6c140a..b157c19d 100644
--- a/cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py
+++ b/cc_pseudo_crawl/python_scripts/extract_text/extract_text_and_html_metadata.py
@@ -431,7 +431,6 @@ def main(args: PreprocessingConfig) -> None:  # Setup logging
     ]

     def process_file(file_name: str):
-
         logger.info(config.HF_DATASETS_CACHE)
         processing_name = (
             "-".join(args.metadata_to_include)
diff --git a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
index 1753acf8..f605e7b9 100644
--- a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
+++ b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
@@ -21,7 +21,7 @@ def main():

     seed_ids = []
     for seed_path in args.seed_paths:
-        with open(seed_path, "r") as fi:
+        with open(seed_path) as fi:
             data = csv.reader(fi)
             # First line is all the headers that we remove.
             seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]
diff --git a/kenlm_training/cc_net/execution.py b/kenlm_training/cc_net/execution.py
index 6ab09a56..435be74d 100644
--- a/kenlm_training/cc_net/execution.py
+++ b/kenlm_training/cc_net/execution.py
@@ -42,7 +42,6 @@ def get_executor(
     task_parallelism: int = -1,
    options: dict = {},
 ) -> Executor:
-
     execution_mode = execution.split(",")[0]
     options.update(
         {kv.split("=", 1)[0]: kv.split("=", 1)[1] for kv in execution.split(",")[1:]}
diff --git a/kenlm_training/cc_net/jsonql.py b/kenlm_training/cc_net/jsonql.py
index 0ff57f23..9694734d 100644
--- a/kenlm_training/cc_net/jsonql.py
+++ b/kenlm_training/cc_net/jsonql.py
@@ -880,8 +880,7 @@ def describe(source, columns=None, weights=None, **kwargs):
             continue
         if "." in k or k == ALL_DOCUMENTS:
             continue
-        for line in display_stats(stats, k, weights=weights, **kwargs):
-            yield line
+        yield from display_stats(stats, k, weights=weights, **kwargs)


 def shard(lines):
@@ -961,7 +960,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
     if filename.suffix == ".gz":
         file: TextIO = gzip.open(filename, "rt")  # type: ignore
     else:
-        file = open(filename, "rt")
+        file = open(filename)

     return _close_when_exhausted(file)

@@ -1015,7 +1014,7 @@ def open_write(
     if filename.suffix == ".gz":
         return BlockedGzipWriter(Path(filename), mode, block_size="64M")

-    return open(filename, "wt")
+    return open(filename, "w")


 def parse_size(size):
diff --git a/kenlm_training/tests/test_jsonql.py b/kenlm_training/tests/test_jsonql.py
index 7d9768e7..c8e5060c 100644
--- a/kenlm_training/tests/test_jsonql.py
+++ b/kenlm_training/tests/test_jsonql.py
@@ -262,7 +262,7 @@ def do(self, x):
     def acc(values):
         print("acc: started")
         res = 0
-        for (x, _) in values:
+        for x, _ in values:
             res += int(x)
         print("acc: done")
         yield f"acc: result={res}"
diff --git a/pii-manager/setup.py b/pii-manager/setup.py
index c5b0714b..4c369204 100644
--- a/pii-manager/setup.py
+++ b/pii-manager/setup.py
@@ -27,7 +27,7 @@

 def requirements(filename="requirements.txt"):
     """Read the requirements file"""
-    with io.open(filename, "r") as f:
+    with open(filename) as f:
         return [line.strip() for line in f if line and line[0] != "#"]


@@ -35,7 +35,7 @@ def long_description():
     """
     Take the README and remove markdown hyperlinks
     """
-    with open("README.md", "rt", encoding="utf-8") as f:
+    with open("README.md", encoding="utf-8") as f:
         desc = f.read()
     desc = re.sub(r"^\[ ([^\]]+) \]: \s+ \S.*\n", r"", desc, flags=re.X | re.M)
     return re.sub(r"\[ ([^\]]+) \]", r"\1", desc, flags=re.X)
diff --git a/pii-manager/src/pii_manager/api/manager.py b/pii-manager/src/pii_manager/api/manager.py
index cdb3d7dd..0de1a190 100644
--- a/pii-manager/src/pii_manager/api/manager.py
+++ b/pii-manager/src/pii_manager/api/manager.py
@@ -31,13 +31,11 @@ def fetch_all_tasks(
     """
     taskdict = get_taskdict(debug=debug)
     # Language-independent
-    for task in taskdict[LANG_ANY].values():
-        yield task
+    yield from taskdict[LANG_ANY].values()

     langdict = taskdict.get(lang, {})
     # Country-independent
-    for task in langdict.get(COUNTRY_ANY, {}).values():
-        yield task
+    yield from langdict.get(COUNTRY_ANY, {}).values()
     # Country-specific
     if country:
         if country[0] in (COUNTRY_ANY, "all"):
@@ -45,8 +43,7 @@ def fetch_all_tasks(
         for c in country:
             if c == COUNTRY_ANY:  # already included above
                 continue
-            for task in langdict.get(c, {}).values():
-                yield task
+            yield from langdict.get(c, {}).values()


 def fetch_task(
diff --git a/pii-manager/test/unit/api/test_file.py b/pii-manager/test/unit/api/test_file.py
index 6d5932c6..0578c67b 100644
--- a/pii-manager/test/unit/api/test_file.py
+++ b/pii-manager/test/unit/api/test_file.py
@@ -12,7 +12,7 @@ def datafile(name: str) -> str:


 def readfile(name: str) -> str:
-    with open(name, "rt", encoding="utf-8") as f:
+    with open(name, encoding="utf-8") as f:
         return f.read().strip()


diff --git a/pii-manager/test/unit/api/test_file_taskfile.py b/pii-manager/test/unit/api/test_file_taskfile.py
index 722dfa8f..3371d7b7 100644
--- a/pii-manager/test/unit/api/test_file_taskfile.py
+++ b/pii-manager/test/unit/api/test_file_taskfile.py
@@ -14,7 +14,7 @@ def datafile(name: str) -> str:


 def readfile(name: str) -> str:
-    with open(name, "rt", encoding="utf-8") as f:
+    with open(name, encoding="utf-8") as f:
         return f.read().strip()


diff --git a/pii-manager/test/unit/api/test_manager.py b/pii-manager/test/unit/api/test_manager.py
index 5f74dbf6..8b874a93 100644
--- a/pii-manager/test/unit/api/test_manager.py
+++ b/pii-manager/test/unit/api/test_manager.py
@@ -21,7 +21,10 @@ def test20_info():

     info = obj.task_info()
     exp = {
-        (PiiEnum.CREDIT_CARD, None,): [
+        (
+            PiiEnum.CREDIT_CARD,
+            None,
+        ): [
             (
                 "credit card",
                 "Credit card numbers for most international credit cards (detect & validate)",
diff --git a/pii-manager/test/unit/api/test_manager_add.py b/pii-manager/test/unit/api/test_manager_add.py
index a61e5eee..152d9eb7 100644
--- a/pii-manager/test/unit/api/test_manager_add.py
+++ b/pii-manager/test/unit/api/test_manager_add.py
@@ -47,7 +47,7 @@ def test110_call():
     obj = PiiManager("en", None, PiiEnum.EMAIL_ADDRESS)
     obj.add_tasks([DUMMY_REGEX])

-    for (doc, exp) in TEST_REGEX:
+    for doc, exp in TEST_REGEX:
         got = obj(doc)
         assert got == exp

@@ -86,6 +86,6 @@ def test200_call():
     obj = PiiManager("en")
     obj.add_tasks([DUMMY_CLASS])

-    for (doc, exp) in TEST_CLASS:
+    for doc, exp in TEST_CLASS:
         got = obj(doc)
         assert got == exp
diff --git a/pii-manager/test/unit/api/test_manager_ctx.py b/pii-manager/test/unit/api/test_manager_ctx.py
index f74701fc..c475498e 100644
--- a/pii-manager/test/unit/api/test_manager_ctx.py
+++ b/pii-manager/test/unit/api/test_manager_ctx.py
@@ -38,7 +38,7 @@ def test10_context_regex():
     """
     obj = PiiManager("en", mode="extract")
     obj.add_tasks([DUMMY_REGEX])
-    for (text, exp) in TEST:
+    for text, exp in TEST:
         got = obj(text)
         assert list(got) == exp

@@ -64,6 +64,6 @@ def test20_context_class():
     """
     obj = PiiManager("en", mode="extract")
     obj.add_tasks([DUMMY_CLASS])
-    for (text, exp) in TEST:
+    for text, exp in TEST:
         got = obj(text)
         assert list(got) == exp
diff --git a/pii-manager/test/unit/helper/test_context.py b/pii-manager/test/unit/helper/test_context.py
index 6a158864..e59665fd 100644
--- a/pii-manager/test/unit/helper/test_context.py
+++ b/pii-manager/test/unit/helper/test_context.py
@@ -74,7 +74,7 @@ def test10_context_true():
     """
     Check valid contexts
     """
-    for (text, context) in TEST_TRUE:
+    for text, context in TEST_TRUE:
         spec = mod.context_spec(context)
         assert mod.context_check(text, spec, 20) is True

@@ -83,7 +83,7 @@ def test20_context_false():
     """
     Check invalid contexts
     """
-    for (text, context) in TEST_FALSE:
+    for text, context in TEST_FALSE:
         spec = mod.context_spec(context)
         assert mod.context_check(text, spec, 20) is False

diff --git a/pii-manager/test/unit/helper/test_norm.py b/pii-manager/test/unit/helper/test_norm.py
index 0a1b73ba..f9772dd1 100644
--- a/pii-manager/test/unit/helper/test_norm.py
+++ b/pii-manager/test/unit/helper/test_norm.py
@@ -8,5 +8,5 @@ def test10_normalizer():
     """
     Create base object
     """
-    for (text, exp) in TEST:
+    for text, exp in TEST:
         assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp
diff --git a/tokenizer/python_script/dedup_lines.py b/tokenizer/python_script/dedup_lines.py
index ea3e4a81..d08f11ee 100644
--- a/tokenizer/python_script/dedup_lines.py
+++ b/tokenizer/python_script/dedup_lines.py
@@ -28,6 +28,7 @@
 META_COLUMNS = ["meta"]


+
 # filter text to remove certain lines (e.g. menu items, copyright notice)
 def filter_lines(article, skip_set, used_lines):
     # TODO discuss the strip