Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added option to recurse sub-directories #14

Merged
merged 1 commit into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# File Processing Script

This script processes files in a specified directory using an API, logs results in a local SQLite database, and provides options for retrying failed or pending files. It includes features for skipping specific files, generating reports, and running multiple API calls in parallel.
This script processes files recursively from a specified directory using an API, logs results in a local SQLite database, and provides options for retrying failed or pending files. It includes features for skipping specific files, generating reports, and running multiple API calls in parallel.

## Features

Expand Down Expand Up @@ -61,6 +61,7 @@ This will display detailed usage information.
- `-p`, `--parallel_call_count`: Number of parallel API calls (default: 10).
- `--csv_report`: Path to export the detailed report as a CSV file.
- `--db_path`: Path where the SQLite DB file is stored (default: './file_processing.db')
- `--recursive`: Recursively identify and process files from the input folder path (default: False)
- `--retry_failed`: Retry processing of failed files.
- `--retry_pending`: Retry processing of pending files by making new requests.
- `--skip_pending`: Skip processing of pending files.
Expand Down
56 changes: 35 additions & 21 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class Arguments:
input_folder_path: str = ""
db_path: str = ""
parallel_call_count: int = 5
recurse_input_folder: bool = False
retry_failed: bool = False
retry_pending: bool = False
skip_pending: bool = False
Expand Down Expand Up @@ -463,11 +464,15 @@ def process_file(


def load_folder(args: Arguments):
files = [
os.path.join(args.input_folder_path, f)
for f in os.listdir(args.input_folder_path)
if os.path.isfile(os.path.join(args.input_folder_path, f))
]
files = []
for root, _, filenames in os.walk(args.input_folder_path):
for f in filenames:
file_path = os.path.join(root, f)
if os.path.isfile(file_path):
files.append(file_path)
if not args.recurse_input_folder:
break
logger.debug(f"Loaded '{len(files)}' files from '{args.input_folder_path}': {files}")

with Manager() as manager, Pool(args.parallel_call_count) as executor:
success_count = manager.Value("i", 0) # Shared integer for success count
Expand Down Expand Up @@ -501,6 +506,24 @@ def load_folder(args: Arguments):
pbar.close()


def api_deployment_batch_run(args: Arguments):
    """Run one end-to-end batch: initialize the DB, process every file in the
    input folder, then print/export the requested summaries and reports.

    Args:
        args: Parsed CLI arguments controlling DB path, reporting flags, etc.
    """
    # Log the full parameter set up-front so every run is reproducible from logs.
    logger.warning(f"Running with params: {args}")

    # The SQLite DB must exist before any file processing is attempted.
    init_db(args=args)
    load_folder(args=args)

    # The summary is always shown; the detailed report and CSV export are opt-in.
    print_summary(args=args)
    if args.print_report:
        print_report(args=args)
        # NOTE(review): resumed-from-pending files carry a misleading elapsed time.
        logger.warning(
            "Elapsed time calculation of a file which was resumed from pending state will not be correct"
        )
    if args.csv_report:
        export_report_to_csv(args=args)


def main():
parser = argparse.ArgumentParser(description="Process files using Unstract's API deployment")
parser.add_argument(
Expand Down Expand Up @@ -564,6 +587,12 @@ def main():
type=str,
help='Path to export the detailed report as a CSV file',
)
parser.add_argument(
"--recursive",
dest="recurse_input_folder",
action="store_true",
help="Recursively identify and process files from the input folder path (default: False)",
)
parser.add_argument(
"--retry_failed",
dest="retry_failed",
Expand Down Expand Up @@ -625,22 +654,7 @@ def main():
ch.setFormatter(formatter)
logging.basicConfig(level=args.log_level, handlers=[ch])

logger.warning(f"Running with params: {args}")

init_db(args=args) # Initialize DB

load_folder(args=args)

print_summary(args=args) # Print summary at the end
if args.print_report:
print_report(args=args)
logger.warning(
"Elapsed time calculation of a file which was resumed"
" from pending state will not be correct"
)

if args.csv_report:
export_report_to_csv(args=args)
api_deployment_batch_run(args=args)


if __name__ == "__main__":
Expand Down
Loading