diff --git a/clean-modular-code/activity-3/clean-code-activity-3.md b/clean-modular-code/activity-3/clean-code-activity-3.md index 24f3257..5148c85 100644 --- a/clean-modular-code/activity-3/clean-code-activity-3.md +++ b/clean-modular-code/activity-3/clean-code-activity-3.md @@ -18,7 +18,7 @@ kernelspec: * In [activity 1](../activity-1/clean-code-activity-1), you made your code cleaner and more usable using expressive variable names and docstrings to document the module. * In [activity 2](../activity-2/clean-code-activity-2), you made your code more DRY ("Don't Repeat Yourself") using documented functions and conditionals. -In this activity, you will build checks into your workflow to handle data processing "features". +In this activity, you will build checks into your workflow to handle data processing "features". +++ {"editable": true, "slideshow": {"slide_type": ""}} @@ -67,9 +67,7 @@ def clean_title(title): return title ``` -+++ {"editable": true, "slideshow": {"slide_type": ""}} - - +The function below raises an error with a custom error message. but you can still see the ```{code-cell} ipython3 --- @@ -86,7 +84,47 @@ def clean_title(title): try: return title[0] except IndexError as e: - raise IndexError(f"Oops! You provided a title in an unexpected format. I expected the title to be provided in a list and you provided a {type(title)}.") from e + raise IndexError(f"Oops! You provided a title in an unexpected format. " + f"I expected the title to be provided in a list and you provided " + f"a {type(title)}.") from e + +# Example usage: +title = "" +print(clean_title(title)) # This will raise an IndexError with the friendly message +``` + +```{code-cell} ipython3 +# This is the preferred way to catch an error +def clean_title(title): + """ + Attempts to return the first character of the title. + Raises the same error with a friendly message if the input is invalid. + """ + try: + return title[0] + except IndexError as e: + raise IndexError(f"Oops! 
You provided a title in an unexpected format. I expected the title to be provided in a list and you provided a {type(title)}.") + +# Example usage: +title = "" +print(clean_title(title)) # This will raise an IndexError with the friendly message +``` + +If you wish, you can shorten the amount of information returned in the exception by adding `from None` to your exception. This will look nicer to a user but you lose some information in the exception feedback. + +```{code-cell} ipython3 +# This is the preferred way to catch an error +def clean_title(title): + """ + Attempts to return the first character of the title. + Raises the same error with a friendly message if the input is invalid. + """ + try: + return title[0] + except IndexError as e: + raise IndexError(f"Oops! You provided a title in an unexpected format. " + f"I expected the title to be provided in a list and you provided " + f"a {type(title)}.") from None # Example usage: title = "" @@ -138,7 +176,93 @@ Important: It is ok if you can't get the code to run fully by the end of this wo 1. identify at least one of the data processing "bugs" (even if you can't fix it) and/or 2. fix at least one bug -You can consider your effort today as a success! +You can consider your effort today as a success! We will work on the first element together as a group. + +```{code-cell} ipython3 +import json +from pathlib import Path + +import pandas as pd + + +def load_clean_json(file_path, columns_to_keep): + """ + Load JSON data from a file. Drop unnecessary columns and normalize + to DataFrame. + + Parameters + ---------- + file_path : Path + Path to the JSON file. + columns_to_keep : list + List of columns to keep in the DataFrame. + + Returns + ------- + dict + Loaded JSON data. 
+ """ + + with file_path.open("r") as json_file: + json_data = json.load(json_file) + normalized_data = pd.json_normalize(json_data) + + return normalized_data.filter(items=columns_to_keep) + + +def format_date(date_parts: list) -> str: + """ + Format date parts into a string. + + Parameters + ---------- + date_parts : list + List containing year, month, and day. + + Returns + ------- + pd.datetime + A date formatted as a pd.datetime object. + """ + date_str = ( + f"{date_parts[0][0]}-{date_parts[0][1]:02d}-{date_parts[0][2]:02d}" + ) + return pd.to_datetime(date_str, format="%Y-%m-%d") + + +def clean_title(value): + """A function that removes a value contained in a list.""" + print("hi", value) + return value[0] + + +columns_to_keep = [ + "publisher", + "DOI", + "type", + "author", + "is-referenced-by-count", + "title", + "published.date-parts", +] + +data_dir = Path("data") + +all_papers_list = [] +for json_file in data_dir.glob("*.json"): + papers_df = load_clean_json(json_file, columns_to_keep) + + papers_df["title"] = papers_df["title"].apply(clean_title) + papers_df["published_date"] = papers_df["published.date-parts"].apply( + format_date + ) + + all_papers_list.append(papers_df) + +all_papers_df = pd.concat(all_papers_list, axis=0, ignore_index=True) + +print("Final shape of combined DataFrame:", all_papers_df.shape) +``` +++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": ["raises-exception"]} @@ -235,7 +359,7 @@ print("Final shape of combined DataFrame:", all_papers_df.shape) What I don't like about this is that the file not found error isn't too bad to figure out. things like keyerrors and value errors are more amorphous. so this could be an OYO 2 -and we could work through a index error or a key error instead? Becuase in that case they may want to return a default value or something else.... +and we could work through a index error or a key error instead? Becuase in that case they may want to return a default value or something else.... 
+++ {"editable": true, "slideshow": {"slide_type": ""}} @@ -314,7 +438,7 @@ Your goal is to troubleshoot any issues associated with cleaning up the title so +++ {"editable": true, "slideshow": {"slide_type": ""}, "tags": ["hide-cell"]} -Note: we can have two groups - one that wants to work on their own and another that wants to work with the instructor together. +Note: we can have two groups - one that wants to work on their own and another that wants to work with the instructor together. +++ {"editable": true, "slideshow": {"slide_type": ""}} diff --git a/clean-modular-code/checks-conditionals/python-function-checks.md b/clean-modular-code/checks-conditionals/python-function-checks.md index 85a4502..37a2378 100644 --- a/clean-modular-code/checks-conditionals/python-function-checks.md +++ b/clean-modular-code/checks-conditionals/python-function-checks.md @@ -30,7 +30,188 @@ Your goal is to identify detect and data processing or workflow problems immedia them to propagate through your code. This approach saves time and makes debugging easier, providing clearer, more useful error outputs (known as stack traces). -When working with messy data, you'll often encounter edge cases - unusual or unexpected data that can break your processing pipeline. Functions allow you to implement robust error handling and data validation. Here are some techniques you can use +When working with messy data, you'll often encounter edge cases - unusual or unexpected data that can break your processing pipeline. Functions allow you to implement robust error handling and data validation. 
+ +(fail-fast)= + +## Fail fast strategy + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +# Open a file (but it doesn't exist +def read_file(file_path): + with open(file_path, 'r') as file: + data = file.read() + return data + +file_data = read_file("nonexistent_file.txt") +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +In the example below, you use a [conditional statement](python-conditionals) to check to see if the file exists; if it doesn't, then it returns None. In this case, the code fails quietly and the user has no indication that there is an error. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- +import os + +def read_file(file_path): + if os.path.exists(file_path): + with open(file_path, 'r') as file: + data = file.read() + return data + else: + return None # Doesn't fail immediately, just returns None + +# No error raised, even though the file doesn't exist +file_data = read_file("nonexistent_file.txt") +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +The code example below is better than the examples above for three reasons: + +1. It's Pythonic: it asks for forgiveness later by using a try/except. +2. It fails quickly - as soon as it tries to open the file. The code won't continue to run after this step fails. +3. It raises a clean, useful error that the user can understand. + +The code anticipates what will happen if it can't find the file. Here you can raise a `FileNotFoundError` and provide a useful message to the user. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +def read_file(file_path): + try: + with open(file_path, 'r') as file: + data = file.read() + return data + except FileNotFoundError: + raise FileNotFoundError(f"Oops! I couldn't find the file located at: {file_path}. 
Please check to see if it exists") + +# Raises an error immediately if the file doesn't exist +file_data = read_file("nonexistent_file.txt") +``` + +Notice that the code below doesn't throw an error. This code is allowed to fail quietly. +This will be problematic for a user as their code will run but they will have no output and will have to carefully debug the code in order to figure out where the problem is. + +```{code-cell} ipython3 +from pathlib import Path + +def read_file(file_path): + """ + Reads the content of a single file and returns it. + Handles errors if the file cannot be read. + """ + try: + with open(file_path, 'r') as file: + data = file.read() + return data + except FileNotFoundError: + return f"File not found: {file_path}" + except Exception as e: + return f"An error occurred while reading {file_path}: {e}" + +# Example usage +directory_path = Path("data") +files = list(directory_path.glob("*.json")) +for file in files: + file_data = read_file(file) + print(f"Content of {file.name}:") + print(file_data) +print("All done--I didn't fail but I should have failed quickly") +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +[Using functions](python-functions) is a great first step in creating a robust +and maintainable data processing workflow. Functions provide modular units that +can be tested independently, allowing you to handle various edge cases and +unexpected scenarios effectively. However, adding checks to your functions +is the next step towards making your code more robust and maintainable over time. + +(pythonic-checks)= + +## Make Checks Pythonic + +Python has a unique philosophy regarding handling potential errors or +exceptional cases. This philosophy is often summarized by the acronym EAFP: +"Easier to Ask for Forgiveness than Permission." When combined with the **fail +fast** approach, your code can be flexible and resilient to the messy +realities of data processing. + +### EAFP vs. 
LBYL + +There are two main approaches to handling potential errors: + +- **LBYL (Look Before You Leap)**: Check for conditions before making calls or + accessing data. +- **EAFP (Easier to Ask for Forgiveness than Permission)**: Assume the operation + will succeed and handle any exceptions if they occur. + +Pythonic code generally favors the EAFP approach, which allows for **failing +fast** when an error occurs, providing useful feedback without unnecessary +checks. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- +# LBYL approach - manually check that the user provides a int +def convert_to_int(value): + if isinstance(value, int): + return int(value) + else: + print("Oops i can't process this so I will fail gracefully.") + return None + +convert_to_int(1) +convert_to_int("a") +``` + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- +# EAFP approach - Consider what the user might provide and catch the error. +def convert_to_int(value): + try: + return int(value) + except ValueError: + print("Oops i can't process this so I will fail gracefully.") + return None # or some default value + +convert_to_int(1) +convert_to_int("a") +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +The EAFP (Easier to Ask for Forgiveness than Permission) approach is more Pythonic because: + +- It’s often faster, avoiding redundant checks when operations succeed. +- It’s more readable, separating the intended operation and error handling. + +## Any Check is a Good Check + +As long as you consider edge cases, you're writing great code! You don’t need to worry about being “Pythonic” immediately, but understanding both approaches is useful regardless of which approach you chose. 
+++ {"editable": true, "slideshow": {"slide_type": ""}} @@ -57,7 +238,7 @@ def convert_to_int(value): try: return int(value) except ValueError: - print("Oops i can't process this so I will fail gracefully.") + print("Oops i can't process this so I will fail quietly with a print statement.") return None # or some default value ``` @@ -235,6 +416,70 @@ my_string = "Hello" my_string.nonexistent_method() ``` ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +## FileNotFoundError + +A `FileNotFoundError` occurs in Python when the code attempts to open or access a file that does not exist at the specified path. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +with open("data/nonexistent_file.json", "r") as file: + data = file.read() +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +By catching this exception, you can + +1. Raise a kinder and more informative message. +2. Direct the user toward the next steps +3. FUTURE: write tests for this step of the workflow (if you are creating a package!) that make sure that it handles a bad file path properly. + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +tags: [raises-exception] +--- +from pathlib import Path + +file_path = Path("data") / "nonexistent_file.json" +try: + with open(file_path, "r") as file: + data = file.read() +except FileNotFoundError as fe: + raise FileNotFoundError(f"Oops! it looks like you provided a path to a file that doesn't exist. You provided: {file_path}. Make sure the file path exists. ") +``` + ++++ {"editable": true, "slideshow": {"slide_type": ""}} + +If you don't raise the error but instead provide a print statement, you can provide a simple, clean output without the full "stack" or set of Python messages that provides the full "tracking" or traceback of where the error originated. 
+ +The challenge with not raising a `FileNotFoundError` is that it will be a bit trickier to test the output. + +- You could call `sys.exit` too, but I've run into issues with that in the past (I wish I could remember what they were). + +```{code-cell} ipython3 +--- +editable: true +slideshow: + slide_type: '' +--- +file_path = Path("data") / "nonexistent_file.json" +try: + with open(file_path, "r") as file: + data = file.read() +except FileNotFoundError as fe: + print(f"Oops! it looks like you provided a path to a file that doesn't exist. You provided: {file_path}. Make sure the file path exists. ") +``` + ```{code-cell} ipython3 --- editable: true