From b178b514519550e355baf0f4f3f6beb73dca7df2 Mon Sep 17 00:00:00 2001
From: Nick Smirnov <125620385+smbrine@users.noreply.github.com>
Date: Wed, 7 Feb 2024 21:59:32 +0300
Subject: [PATCH] feat(bulk-ingest): Add --ignored Flag to Exclude Specific
 Files and Directories During Ingestion (#1432)

---
Review note (not part of the commit message; `git am` drops everything
between the three-dash line and the first file diff):
  * Line structure restored so that every hunk parses again.
  * scripts/ingest_folder.py: the unused `directories_to_watch` list is
    replaced by a filtering callback, so --ignored also applies in --watch
    mode; the "Skipping ..." message is now logged before ingestion starts.
  * The diffstat and the last hunk header are updated to match. The commit
    id above and the `index` blob ids still describe the upstream revision.

 .gitignore               |  2 ++
 Makefile                 | 17 +++++++++++++++++
 scripts/ingest_folder.py | 41 +++++++++++++++++++++++++++++++++--------
 3 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index bc7d21253..847a30db3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 .venv
+.env
+venv
 
 settings-me.yaml
 
diff --git a/Makefile b/Makefile
index a2e2d8d3c..67b76e40a 100644
--- a/Makefile
+++ b/Makefile
@@ -56,3 +56,20 @@ wipe:
 
 setup:
 	poetry run python scripts/setup
+
+list:
+	@echo "Available commands:"
+	@echo " test : Run tests using pytest"
+	@echo " test-coverage : Run tests with coverage report"
+	@echo " black : Check code format with black"
+	@echo " ruff : Check code with ruff"
+	@echo " format : Format code with black and ruff"
+	@echo " mypy : Run mypy for type checking"
+	@echo " check : Run format and mypy commands"
+	@echo " run : Run the application"
+	@echo " dev-windows : Run the application in development mode on Windows"
+	@echo " dev : Run the application in development mode"
+	@echo " api-docs : Generate API documentation"
+	@echo " ingest : Ingest data using specified script"
+	@echo " wipe : Wipe data using specified script"
+	@echo " setup : Setup the application"
diff --git a/scripts/ingest_folder.py b/scripts/ingest_folder.py
index fc1740a27..8c6acad1c 100755
--- a/scripts/ingest_folder.py
+++ b/scripts/ingest_folder.py
@@ -20,20 +20,20 @@ def __init__(self, ingest_service: IngestService) -> None:
 
         self._files_under_root_folder: list[Path] = list()
 
-    def _find_all_files_in_folder(self, root_path: Path) -> None:
+    def _find_all_files_in_folder(self, root_path: Path, ignored: list[str]) -> None:
         """Search all files under the root folder recursively.
         Count them at the same time
         """
         for file_path in root_path.iterdir():
-            if file_path.is_file():
+            if file_path.is_file() and file_path.name not in ignored:
                 self.total_documents += 1
                 self._files_under_root_folder.append(file_path)
-            elif file_path.is_dir():
-                self._find_all_files_in_folder(file_path)
+            elif file_path.is_dir() and file_path.name not in ignored:
+                self._find_all_files_in_folder(file_path, ignored)
 
-    def ingest_folder(self, folder_path: Path) -> None:
+    def ingest_folder(self, folder_path: Path, ignored: list[str]) -> None:
         # Count total documents before ingestion
-        self._find_all_files_in_folder(folder_path)
+        self._find_all_files_in_folder(folder_path, ignored)
         self._ingest_all(self._files_under_root_folder)
 
     def _ingest_all(self, files_to_ingest: list[Path]) -> None:
@@ -64,12 +64,19 @@ def _do_ingest_one(self, changed_path: Path) -> None:
     action=argparse.BooleanOptionalAction,
     default=False,
 )
+parser.add_argument(
+    "--ignored",
+    nargs="*",
+    help="List of files/directories to ignore",
+    default=[],
+)
 parser.add_argument(
     "--log-file",
     help="Optional path to a log file. If provided, logs will be written to this file.",
     type=str,
     default=None,
 )
+
 args = parser.parse_args()
 
 # Set up logging to a file if a path is provided
@@ -91,9 +98,27 @@ def _do_ingest_one(self, changed_path: Path) -> None:
 
     ingest_service = global_injector.get(IngestService)
     worker = LocalIngestWorker(ingest_service)
-    worker.ingest_folder(root_path)
+
+    if args.ignored:
+        logger.info(f"Skipping following files and directories: {args.ignored}")
+
+    worker.ingest_folder(root_path, args.ignored)
 
     if args.watch:
         logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
-        watcher = IngestWatcher(args.folder, worker.ingest_on_watch)
+        ignored_names = set(args.ignored)
+
+        def ingest_unless_ignored(changed_path: Path) -> None:
+            # --ignored must hold in watch mode too: drop a change when any
+            # path component below the watched folder is an ignored name.
+            # NOTE(review): assumes the watcher hands over a pathlib.Path.
+            try:
+                parts = changed_path.relative_to(root_path).parts
+            except ValueError:
+                # Not located under root_path as spelled; test the name only.
+                parts = (changed_path.name,)
+            if ignored_names.isdisjoint(parts):
+                worker.ingest_on_watch(changed_path)
+
+        watcher = IngestWatcher(args.folder, ingest_unless_ignored)
         watcher.start()