Fixing issue in python code snippets formatting script: (#381)

* Fixing issue in python code snippets formatting script: Fix recursive files lookup; fix issue with jupyter-notebook specific commands * Update workflow to commit reformatted code snippets * manage invalid syntax exception during black formatting * Fix pipeline * Fix pipeline * Fix pipeline * Format Python snippets in MDX files --------- Co-authored-by: Max Shkutnyk <[email protected]> Co-authored-by: GitHub Action <[email protected]>
cohere-ai · Jan 29, 2025 · 76dc7d8 · 76dc7d8
1 parent b39227d
commit 76dc7d8
Show file tree

Hide file tree

Showing 10 changed files with 95 additions and 66 deletions.
diff --git a/.github/scripts/check_python_code_snippets.py b/.github/scripts/check_python_code_snippets.py
@@ -7,45 +7,69 @@
 BASE_DIR = Path(__file__).resolve().parent
 MDX_DIR = BASE_DIR / "../../fern/pages"
 FILE_PATTERN = re.compile(r"\.mdx$")
+EXCLUDE_DIRS = ["cookbooks"]  # Add directory names to exclude
 
 
-def find_files_by_pattern(directory, pattern):
+def find_files_by_pattern(directory, pattern, exclude_dirs=None):
     """
     Finds all files in the given directory that match the provided regex pattern.
+    Skips directories listed in exclude_dirs.
     """
+    exclude_dirs = exclude_dirs or []
     directory = Path(directory).resolve()
+
     if not directory.is_dir():
         raise ValueError(f"Provided directory {directory} is not valid.")
-    return [f for f in directory.rglob('*') if f.is_file() and pattern.search(f.name)]
+
+    matching_files = []
+    for root, dirs, files in os.walk(directory):
+        # Remove excluded directories from the walk
+        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+        for file_name in files:
+            file_path = Path(root) / file_name
+            if pattern.search(file_name):
+                matching_files.append(file_path)
+
+    return matching_files
 
 
 def format_python_snippets_in_mdx(file_path, line_length=DEFAULT_LINE_LENGTH):
     """
     Formats Python code snippets inside MDX files using Black.
     """
     black_mode = black.FileMode(line_length=line_length)
-    code_block_pattern = re.compile(r"```python\n(.*?)\n```", re.DOTALL)
+    code_block_pattern = re.compile(r"(`{3,4})(python|python PYTHON)\n(.*?)\n\1", re.DOTALL)
 
     with open(file_path, 'r', encoding='utf-8') as file:
         original_content = file.read()
 
     def format_with_black(match):
-        code = match.group(1)
+        """
+        Formats the matched Python code block using Black
+        """
+        backtick_count = match.group(1)  # Preserve the backtick count (``` or ````)
+        block_label = match.group(2)  # Capture the label (python or python PYTHON)
+        code = match.group(3)
 
-        # Comment out lines starting with '!'
-        processed_code = re.sub(r"^\s*!(.*)", r"# TEMP_COMMENT !\1", code, flags=re.MULTILINE)
+        # Comment out lines starting with '!' or '%' for formatting
+        processed_code = re.sub(r"^\s*(!|%)(.*)", r"# TEMP_COMMENT_\1\2", code, flags=re.MULTILINE)
 
+        # Format the processed code using Black
         try:
-            # Format the code with Black
             formatted_code = black.format_str(processed_code, mode=black_mode)
         except black.NothingChanged:
-            # If Black doesn't change anything, use original
-            formatted_code = processed_code  
+            return match.group(0)  # Return the original block if nothing changed
+        except black.parsing.InvalidInput as e:
+            print(f"Error formatting Python code in {file_path}: {e}")
+            # Optionally return original unformatted code or handle differently
+            return match.group(0)
 
-        # Revert the commented lines starting with '!'
-        reverted_code = re.sub(r"^\s*# TEMP_COMMENT !(.*)", r"!\1", formatted_code, flags=re.MULTILINE)
+        # Revert the temporary comments back to their original form
+        reverted_code = re.sub(r"^\s*# TEMP_COMMENT_(!|%)(.*)", r"\1\2", formatted_code, flags=re.MULTILINE)
 
-        return f"```python\n{reverted_code.strip()}\n```"
+        # Return the fully formatted and reverted block
+        return f"{backtick_count}{block_label}\n{reverted_code.strip()}\n{backtick_count}"
 
     new_content = code_block_pattern.sub(format_with_black, original_content)
 
@@ -55,44 +79,30 @@ def format_with_black(match):
     return original_content, new_content
 
 
-def process_mdx_files(directory, file_pattern, line_length=DEFAULT_LINE_LENGTH, check_changes=False):
+def process_mdx_files(directory, file_pattern, exclude_dirs=None, line_length=DEFAULT_LINE_LENGTH):
     """
     Processes all MDX files in the directory, formatting Python code snippets.
-
-    Args:
-        directory (Path or str): Path to the directory containing MDX files.
-        file_pattern (re.Pattern): Regex pattern to match MDX files.
-        line_length (int): Line length to use for Black formatting.
-        check_changes (bool): If True, raises an exception if changes are detected.
     """
-    matching_files = find_files_by_pattern(directory, file_pattern)
-    files_changed = []
+    matching_files = find_files_by_pattern(directory, file_pattern, exclude_dirs)
 
     for file_path in matching_files:
         original_content, new_content = format_python_snippets_in_mdx(file_path, line_length)
 
         if original_content != new_content:
-            files_changed.append(file_path)
-
-    if check_changes and files_changed:
-        raise RuntimeError(
-            f"The following files were modified during the run:\n"
-            + "\n".join(str(file) for file in files_changed)
-        )
+            print(f"Formatted: {file_path}")
 
 
 if __name__ == "__main__":
     import sys
 
     path = sys.argv[1] if len(sys.argv) > 1 else MDX_DIR
     line_length = int(sys.argv[2]) if len(sys.argv) > 2 else DEFAULT_LINE_LENGTH
-    check_changes = os.getenv("CI") == "true"  # Set to True in CI pipeline
 
     if Path(path).is_dir():
-        process_mdx_files(path, FILE_PATTERN, line_length, check_changes)
+        process_mdx_files(path, FILE_PATTERN, EXCLUDE_DIRS, line_length)
     elif Path(path).is_file():
         if FILE_PATTERN.search(path):
-            process_mdx_files(Path(path).parent, FILE_PATTERN, line_length, check_changes)
+            process_mdx_files(Path(path).parent, FILE_PATTERN, EXCLUDE_DIRS, line_length)
         else:
             print("The specified file does not match the MDX pattern.")
     else:

diff --git a/.github/workflows/check-python-code-snippets.yml b/.github/workflows/check-python-code-snippets.yml
@@ -16,6 +16,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
 
       - name: Set up Python
         uses: actions/setup-python@v4
@@ -34,6 +36,18 @@ jobs:
 
       - name: Run Python MDX Snippet Formatter
         shell: bash
-        env:
-          CI: true
         run: poetry run python .github/scripts/check_python_code_snippets.py fern/pages
+
+      - name: Check for changes
+        id: diff
+        run: |
+          git diff --exit-code || echo "::set-output name=changes::true"
+
+      - name: Commit and Push Changes
+        if: steps.diff.outputs.changes == 'true'
+        run: |
+          git config --local user.email "[email protected]"
+          git config --local user.name "GitHub Action"
+          git add -u
+          git commit -m "Format Python snippets in MDX files"
+          git push
diff --git a/fern/pages/tutorials/build-things-with-cohere.mdx b/fern/pages/tutorials/build-things-with-cohere.mdx
@@ -39,7 +39,7 @@ Next, we'll import the `cohere` library and create a client to be used throughou
 import cohere
 
 # Get your API key here: https://dashboard.cohere.com/api-keys
-co = cohere.Client(api_key="YOUR_COHERE_API_KEY")  
+co = cohere.Client(api_key="YOUR_COHERE_API_KEY")
 ```
 
 # Accessing Cohere from Other Platforms

diff --git a/fern/pages/tutorials/build-things-with-cohere/building-a-chatbot-with-cohere.mdx b/fern/pages/tutorials/build-things-with-cohere/building-a-chatbot-with-cohere.mdx
@@ -31,7 +31,7 @@ To get started, first we need to install the `cohere` library and create a Coher
 import cohere
 
 # Get your API key: https://dashboard.cohere.com/api-keys
-co = cohere.Client("COHERE_API_KEY")  
+co = cohere.Client("COHERE_API_KEY")
 ```
 
 ## Creating a custom preamble

diff --git a/fern/pages/tutorials/build-things-with-cohere/building-an-agent-with-cohere.mdx b/fern/pages/tutorials/build-things-with-cohere/building-an-agent-with-cohere.mdx
@@ -36,7 +36,7 @@ import numpy as np
 import cohere
 
 # Get your API key: https://dashboard.cohere.com/api-keys
-co = cohere.Client("COHERE_API_KEY")  
+co = cohere.Client("COHERE_API_KEY")
 ```
 
 ## Creating tools

diff --git a/fern/pages/tutorials/build-things-with-cohere/rag-with-cohere.mdx b/fern/pages/tutorials/build-things-with-cohere/rag-with-cohere.mdx
@@ -38,7 +38,7 @@ import numpy as np
 import cohere
 
 # Get your API key: https://dashboard.cohere.com/api-keys
-co = cohere.Client("COHERE_API_KEY")  
+co = cohere.Client("COHERE_API_KEY")
 ```
 
 ## Basic RAG

diff --git a/fern/pages/tutorials/build-things-with-cohere/reranking-with-cohere.mdx b/fern/pages/tutorials/build-things-with-cohere/reranking-with-cohere.mdx
@@ -33,7 +33,7 @@ import numpy as np
 import cohere
 
 # Get your API key: https://dashboard.cohere.com/api-keys
-co = cohere.Client("COHERE_API_KEY")  
+co = cohere.Client("COHERE_API_KEY")
 ```
 
 ## Reranking lexical/semantic search results

diff --git a/fern/pages/tutorials/build-things-with-cohere/semantic-search-with-cohere.mdx b/fern/pages/tutorials/build-things-with-cohere/semantic-search-with-cohere.mdx
@@ -36,7 +36,7 @@ import numpy as np
 import cohere
 
 # Get your API key: https://dashboard.cohere.com/api-keys
-co = cohere.Client("COHERE_API_KEY")  
+co = cohere.Client("COHERE_API_KEY")
 ```
 
 ## Embedding the documents

diff --git a/fern/pages/tutorials/build-things-with-cohere/text-generation-tutorial.mdx b/fern/pages/tutorials/build-things-with-cohere/text-generation-tutorial.mdx
@@ -31,7 +31,7 @@ To get started, first we need to install the `cohere` library and create a Coher
 import cohere
 
 # Get your API key: https://dashboard.cohere.com/api-keys
-co = cohere.Client("COHERE_API_KEY")  
+co = cohere.Client("COHERE_API_KEY")
 ```
 
 ## Basic text generation

diff --git a/fern/pages/v2/text-generation/structured-outputs.mdx b/fern/pages/v2/text-generation/structured-outputs.mdx
@@ -141,46 +141,51 @@ In this schema, we defined three keys ("title," "author," "publication_year") an
 Here's an example of a nested array. Note that the top level json structure must always be a json object.
 
 ```python PYTHON
-cohere_api_key = os.getenv('cohere_api_key')
+cohere_api_key = os.getenv("cohere_api_key")
 co = cohere.ClientV2(cohere_api_key)
 response = co.chat(
     response_format={
-      "type": "json_object",
-      "schema": {
-        "type": "object",
-        "properties": {
-          "actions": {
-            "type": "array",          
-            "items": {
-              "type": "object",          
-              "properties": {
-                "japanese": {"type": "string"},
-                "romaji": {"type": "string"},
-                "english": {"type": "string"}
-              },
-              "required": ["japanese", "romaji", "english"]
+        "type": "json_object",
+        "schema": {
+            "type": "object",
+            "properties": {
+                "actions": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "japanese": {"type": "string"},
+                            "romaji": {"type": "string"},
+                            "english": {"type": "string"},
+                        },
+                        "required": ["japanese", "romaji", "english"],
+                    },
+                }
             },
-          }
+            "required": ["actions"],
         },
-        "required": ["actions"]
-      }
     },
     model="command-r",
     messages=[
-      {"role": "user", "content": "Generate a JSON array of objects with the following fields: japanese, romaji, english. These actions should be japanese verbs provided in the dictionary form.},
-    ]
-)  
+        {
+            "role": "user",
+            "content": "Generate a JSON array of objects with the following fields: japanese, romaji, english. These actions should be japanese verbs provided in the dictionary form.",
+        },
+    ],
+)
 return json.loads(response.message.content[0].text)
 ```
 
 The output for this example would be:
 
 ```json
-{"actions": [
-  {"japanese": "いこう", "romaji": "ikou", "english": "onward"},
-  {"japanese": "探す", "romaji": "sagasu", "english": "search"},
-  {"japanese": "話す", "romaji": "hanasu", "english": "talk"}
-]
+{
+    "actions": [
+        {"japanese": "いこう", "romaji": "ikou", "english": "onward"},
+        {"japanese": "探す", "romaji": "sagasu", "english": "search"},
+        {"japanese": "話す", "romaji": "hanasu", "english": "talk"}
+    ]
+}
 ```