From 8c9136ae529e031620c11db07a9eb220de8c8a68 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 10 Oct 2022 17:58:42 +0000
Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ac_dc/deduplicate/self_deduplicate.py               | 3 +--
 bertin/evaluation/run_glue.py                       | 1 -
 bertin/evaluation/run_ner.py                        | 1 -
 bertin/mc4/mc4.py                                   | 2 +-
 bertin/run_mlm_flax.py                              | 1 -
 bertin/run_mlm_flax_stream.py                       | 3 +--
 bertin/utils/dataset_perplexity.py                  | 2 +-
 cc_pseudo_crawl/python_scripts/download_warc.py     | 4 ++--
 cc_pseudo_crawl/python_scripts/load_all_seed_ids.py | 2 +-
 kenlm_training/cc_net/jsonql.py                     | 5 ++---
 pii-manager/setup.py                                | 4 ++--
 pii-manager/src/pii_manager/api/manager.py          | 9 +++------
 pii-manager/test/unit/api/test_file.py              | 2 +-
 pii-manager/test/unit/api/test_file_taskfile.py     | 2 +-
 14 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/ac_dc/deduplicate/self_deduplicate.py b/ac_dc/deduplicate/self_deduplicate.py
index 74cf88b2..be8008e2 100644
--- a/ac_dc/deduplicate/self_deduplicate.py
+++ b/ac_dc/deduplicate/self_deduplicate.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # @Date        : 2022-01-08 22:39:29
 # @Author      : Chenghao Mou (mouchenghao@gmail.com)
 # @Description : Self-deduplication with `datasets`
@@ -28,7 +27,7 @@


 def main(conf: str) -> None:
-    with open(conf, "r") as f:
+    with open(conf) as f:
         conf = yaml.safe_load(f.read())

     if conf["load_from_disk"]["path"]:
diff --git a/bertin/evaluation/run_glue.py b/bertin/evaluation/run_glue.py
index a08cba25..93ebb264 100644
--- a/bertin/evaluation/run_glue.py
+++ b/bertin/evaluation/run_glue.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/bertin/evaluation/run_ner.py b/bertin/evaluation/run_ner.py
index dbd9cd9a..d6139f13 100644
--- a/bertin/evaluation/run_ner.py
+++ b/bertin/evaluation/run_ner.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/bertin/mc4/mc4.py b/bertin/mc4/mc4.py
index 923e5e20..5f915d4f 100644
--- a/bertin/mc4/mc4.py
+++ b/bertin/mc4/mc4.py
@@ -404,7 +404,7 @@ def _generate_examples(self, filepaths):
         for filepath in filepaths:
             logger.info("generating examples from = %s", filepath)
             if filepath.endswith("jsonl"):
-                with open(filepath, "r", encoding="utf-8") as f:
+                with open(filepath, encoding="utf-8") as f:
                     for line in f:
                         if line:
                             example = json.loads(line)
diff --git a/bertin/run_mlm_flax.py b/bertin/run_mlm_flax.py
index 54251b94..a0bea219 100644
--- a/bertin/run_mlm_flax.py
+++ b/bertin/run_mlm_flax.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/bertin/run_mlm_flax_stream.py b/bertin/run_mlm_flax_stream.py
index a33eaae1..1335bcf8 100644
--- a/bertin/run_mlm_flax_stream.py
+++ b/bertin/run_mlm_flax_stream.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -446,7 +445,7 @@ def restore_checkpoint(save_dir, state):
     args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
     data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))

-    with open(os.path.join(save_dir, "training_state.json"), "r") as f:
+    with open(os.path.join(save_dir, "training_state.json")) as f:
         training_state = json.load(f)

     step = training_state["step"]
diff --git a/bertin/utils/dataset_perplexity.py b/bertin/utils/dataset_perplexity.py
index 2ca470c8..ecf02308 100644
--- a/bertin/utils/dataset_perplexity.py
+++ b/bertin/utils/dataset_perplexity.py
@@ -17,7 +17,7 @@ def get_perplexity(doc):


 with open("mc4-es-train-50M-stats.csv", "w") as csv:
-    with open("mc4-es-train-50M-steps.jsonl", "r") as data:
+    with open("mc4-es-train-50M-steps.jsonl") as data:
         for line in tqdm(data):
             text = json.loads(line)["text"]
             csv.write(f"{len(text.split())},{get_perplexity(text)}\n")
diff --git a/cc_pseudo_crawl/python_scripts/download_warc.py b/cc_pseudo_crawl/python_scripts/download_warc.py
index a5699c76..0ba1bde1 100644
--- a/cc_pseudo_crawl/python_scripts/download_warc.py
+++ b/cc_pseudo_crawl/python_scripts/download_warc.py
@@ -143,9 +143,9 @@ def get_warcs(batch):
         existing_compressed_warcs,
     )

-    batch["compressed_warc"], batch["download_exception"] = [
+    batch["compressed_warc"], batch["download_exception"] = (
         list(l) for l in zip(*warcs_or_exceptions)
-    ]
+    )

     return batch

diff --git a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
index 1753acf8..f605e7b9 100644
--- a/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
+++ b/cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
@@ -21,7 +21,7 @@ def main():

     seed_ids = []
     for seed_path in args.seed_paths:
-        with open(seed_path, "r") as fi:
+        with open(seed_path) as fi:
             data = csv.reader(fi)
             # First line is all the headers that we remove.
             seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]
diff --git a/kenlm_training/cc_net/jsonql.py b/kenlm_training/cc_net/jsonql.py
index 0ff57f23..ba290a91 100644
--- a/kenlm_training/cc_net/jsonql.py
+++ b/kenlm_training/cc_net/jsonql.py
@@ -880,8 +880,7 @@ def describe(source, columns=None, weights=None, **kwargs):
             continue
         if "." in k or k == ALL_DOCUMENTS:
             continue
-        for line in display_stats(stats, k, weights=weights, **kwargs):
-            yield line
+        yield from display_stats(stats, k, weights=weights, **kwargs)


 def shard(lines):
@@ -961,7 +960,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
     if filename.suffix == ".gz":
         file: TextIO = gzip.open(filename, "rt")  # type: ignore
     else:
-        file = open(filename, "rt")
+        file = open(filename)

     return _close_when_exhausted(file)

diff --git a/pii-manager/setup.py b/pii-manager/setup.py
index c5b0714b..4c369204 100644
--- a/pii-manager/setup.py
+++ b/pii-manager/setup.py
@@ -27,7 +27,7 @@


 def requirements(filename="requirements.txt"):
     """Read the requirements file"""
-    with io.open(filename, "r") as f:
+    with open(filename) as f:
         return [line.strip() for line in f if line and line[0] != "#"]

@@ -35,7 +35,7 @@ def long_description():
     """
     Take the README and remove markdown hyperlinks
     """
-    with open("README.md", "rt", encoding="utf-8") as f:
+    with open("README.md", encoding="utf-8") as f:
         desc = f.read()
     desc = re.sub(r"^\[ ([^\]]+) \]: \s+ \S.*\n", r"", desc, flags=re.X | re.M)
     return re.sub(r"\[ ([^\]]+) \]", r"\1", desc, flags=re.X)
diff --git a/pii-manager/src/pii_manager/api/manager.py b/pii-manager/src/pii_manager/api/manager.py
index cdb3d7dd..0de1a190 100644
--- a/pii-manager/src/pii_manager/api/manager.py
+++ b/pii-manager/src/pii_manager/api/manager.py
@@ -31,13 +31,11 @@ def fetch_all_tasks(
     """
     taskdict = get_taskdict(debug=debug)
     # Language-independent
-    for task in taskdict[LANG_ANY].values():
-        yield task
+    yield from taskdict[LANG_ANY].values()

     langdict = taskdict.get(lang, {})
     # Country-independent
-    for task in langdict.get(COUNTRY_ANY, {}).values():
-        yield task
+    yield from langdict.get(COUNTRY_ANY, {}).values()
     # Country-specific
     if country:
         if country[0] in (COUNTRY_ANY, "all"):
@@ -45,8 +43,7 @@
         for c in country:
             if c == COUNTRY_ANY:  # already included above
                 continue
-            for task in langdict.get(c, {}).values():
-                yield task
+            yield from langdict.get(c, {}).values()


 def fetch_task(
diff --git a/pii-manager/test/unit/api/test_file.py b/pii-manager/test/unit/api/test_file.py
index 6d5932c6..0578c67b 100644
--- a/pii-manager/test/unit/api/test_file.py
+++ b/pii-manager/test/unit/api/test_file.py
@@ -12,7 +12,7 @@ def datafile(name: str) -> str:


 def readfile(name: str) -> str:
-    with open(name, "rt", encoding="utf-8") as f:
+    with open(name, encoding="utf-8") as f:
         return f.read().strip()


diff --git a/pii-manager/test/unit/api/test_file_taskfile.py b/pii-manager/test/unit/api/test_file_taskfile.py
index 722dfa8f..3371d7b7 100644
--- a/pii-manager/test/unit/api/test_file_taskfile.py
+++ b/pii-manager/test/unit/api/test_file_taskfile.py
@@ -14,7 +14,7 @@ def datafile(name: str) -> str:


 def readfile(name: str) -> str:
-    with open(name, "rt", encoding="utf-8") as f:
+    with open(name, encoding="utf-8") as f:
         return f.read().strip()