diff --git a/README.md b/README.md index 71a6bfe..0209a79 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,14 @@ $ copy-spotter [-s] [-o] [-h] input_directory ***Positional Arguments:*** * `input_directory`: Directory that contains one folder per pdf file (see `data/pdf/plagiarism` for example) +``` +input_directory/ +│ +├── file_1.docx +├── file_2.pdf +└── file_3.pdf +``` + ***Optional Arguments:*** * `-s`, `--block-size`: Set minimum number of consecutive and similar words detected. (Default is 2) * `-o`, `--out_dir`: Set the output directory for html files. (Default is creating a new directory called results) @@ -72,8 +80,4 @@ $ python -m scripts.main [-s] [-o] [-h] input_directory --- - Add more tests on existing functions - Implement OCR with tesseract for scanned documents -- Add info in console for timing (tqdm) -- Add CSS to HTML Template to make the results better looking -- Add support for other folder structures (right now the package is expecting one pdf files per folder) -- Add custom naming option for pdf files -- Fix Slate3k by installing custom fork (check if still relevant) \ No newline at end of file +- Add custom naming option for pdf files \ No newline at end of file diff --git a/data/pdf/plagiarism/Axel Mare_report/report.txt b/data/pdf/plagiarism/Axel Mare_report/report.txt deleted file mode 100644 index 20a6c93..0000000 --- a/data/pdf/plagiarism/Axel Mare_report/report.txt +++ /dev/null @@ -1 +0,0 @@ -This is my report! I didn't not cheat... \ No newline at end of file diff --git a/data/pdf/plagiarism/John Doe_report/report_2.txt b/data/pdf/plagiarism/John Doe_report/report_2.txt deleted file mode 100644 index d83a600..0000000 --- a/data/pdf/plagiarism/John Doe_report/report_2.txt +++ /dev/null @@ -1 +0,0 @@ -This is my report! I didn't not cheat... Or did I ? \ No newline at end of file diff --git a/data/pdf/plagiarism/Lucas Pelipe_report/random_txt.txt b/data/pdf/plagiarism/Lucas Pelipe_report/random_txt.txt deleted file mode 100644 index 25e7183..0000000 --- a/data/pdf/plagiarism/Lucas Pelipe_report/random_txt.txt +++ /dev/null @@ -1 +0,0 @@ -This is a random text file that has nothing to do with this project. \ No newline at end of file diff --git a/data/pdf/plagiarism/Marie Pole_report/final_version.txt b/data/pdf/plagiarism/Marie Pole_report/final_version.txt deleted file mode 100644 index 6f69592..0000000 --- a/data/pdf/plagiarism/Marie Pole_report/final_version.txt +++ /dev/null @@ -1 +0,0 @@ -Binic Folks Blues Festival \ No newline at end of file diff --git a/scripts/main.py b/scripts/main.py index 6b5170d..a5861fb 100644 --- a/scripts/main.py +++ b/scripts/main.py @@ -18,7 +18,7 @@ from scripts.html_utils import writing_results from scripts.processing_files import file_extension_call from scripts.similarity import difflib_overlap -from scripts.utils import wait_for_file, get_student_names, parse_options +from scripts.utils import wait_for_file, parse_options class MinimumFilesError(Exception): @@ -62,7 +62,7 @@ def main() -> None: in_dir = path.abspath(in_dir) files = [ - f for f in listdir(in_dir) if path.isdir(path.join(in_dir, f)) or f.endswith(("txt", "pdf", "docx", "odt")) + f for f in listdir(in_dir) if path.isfile(path.join(in_dir, f)) and f.endswith(("txt", "pdf", "docx", "odt")) ] if len(files) < 2: @@ -71,19 +71,14 @@ def main() -> None: ) filenames, processed_files = [], [] - students_names = get_student_names(in_dir) - - for ind, direc in enumerate(tqdm(listdir(in_dir), desc="Processing Directories")): - if path.isdir(path.join(in_dir, direc)): - for file in listdir(path.join(in_dir, direc)): - file_words = file_extension_call(str(path.join(in_dir, direc, file))) - if file_words: # If all files have supported format - processed_files.append(file_words) - filenames.append(students_names[ind]) - else: - raise UnsupportedFileError( - "Remove files which are not txt, pdf, docx, or odt and run the script again." - ) + + for file in tqdm(files, desc="Processing Files"): + file_words = file_extension_call(str(path.join(in_dir, file))) + if file_words: # If all files have supported format + processed_files.append(file_words) + filenames.append(path.splitext(file)[0]) + else: + raise UnsupportedFileError("Remove files which are not txt, pdf, docx, or odt and run the script again.") if out_dir is not None and path.exists(out_dir): if not path.isabs(out_dir): diff --git a/scripts/processing_files.py b/scripts/processing_files.py index 02afd5d..455841d 100644 --- a/scripts/processing_files.py +++ b/scripts/processing_files.py @@ -13,14 +13,12 @@ def get_file_extension(filepath: str) -> str: """Return the file extension of the file at the specified path""" if not path.isfile(filepath): - print("Invalid file path") - return "" + raise ValueError(f"Invalid file path: {filepath}") try: return path.splitext(filepath)[1] except IndexError: - print("File extension error") - return "" + raise ValueError(f"File extension error for file: {filepath}") def file_extension_call(file: str) -> list: @@ -28,18 +26,16 @@ def file_extension_call(file: str) -> list: extension = get_file_extension(file) - if extension: - if extension == ".pdf": - return get_words_from_pdf_file(file) - if extension == ".docx": - return get_words_from_docx_file(file) - if extension == ".odt": - return get_words_from_odt_file(file) - if extension == ".txt": - return get_words_from_txt_file(file) - - print("File format is not supported. Please convert to pdf, docx, odt or txt") - return [] + if extension == ".pdf": + return get_words_from_pdf_file(file) + elif extension == ".docx": + return get_words_from_docx_file(file) + elif extension == ".odt": + return get_words_from_odt_file(file) + elif extension == ".txt": + return get_words_from_txt_file(file) + else: + raise ValueError(f"File format not supported for file: {file}. " f"Please convert to pdf, docx, odt, or txt") def get_words_from_pdf_file(pdf_path: str) -> list: