Skip to content

Commit

Permalink
Fixing issue in python code snippets formatting script: (#381)
Browse files Browse the repository at this point in the history
* Fixing issue in python code snippets formatting script:
Fix recursive files lookup
fix issue with jupyter-notebook specific commands

* Update workflow to commit reformatted code snippets

* manage invalid syntax exception during black formatting

* Fix pipeline

* Fix pipeline

* Fix pipeline

* Format Python snippets in MDX files

---------

Co-authored-by: Max Shkutnyk <[email protected]>
Co-authored-by: GitHub Action <[email protected]>
  • Loading branch information
3 people authored Jan 29, 2025
1 parent b39227d commit 76dc7d8
Show file tree
Hide file tree
Showing 10 changed files with 95 additions and 66 deletions.
72 changes: 41 additions & 31 deletions .github/scripts/check_python_code_snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,45 +7,69 @@
BASE_DIR = Path(__file__).resolve().parent
MDX_DIR = BASE_DIR / "../../fern/pages"
FILE_PATTERN = re.compile(r"\.mdx$")
EXCLUDE_DIRS = ["cookbooks"] # Add directory names to exclude


def find_files_by_pattern(directory, pattern, exclude_dirs=None):
    """
    Finds all files under the given directory whose name matches the regex pattern.

    The directory tree is walked recursively. Any sub-directory whose name is
    listed in exclude_dirs is pruned and never descended into.

    Args:
        directory (Path or str): Root directory to search.
        pattern (re.Pattern): Compiled regex applied to each file name via search().
        exclude_dirs (iterable of str, optional): Directory names to skip at any depth.

    Returns:
        list[Path]: Paths of the matching files, in sorted traversal order.

    Raises:
        ValueError: If directory is not an existing directory.
    """
    excluded = set(exclude_dirs or ())
    directory = Path(directory).resolve()

    if not directory.is_dir():
        raise ValueError(f"Provided directory {directory} is not valid.")

    matching_files = []
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk does not descend into excluded directories;
        # sorting keeps the traversal order independent of the filesystem.
        dirs[:] = sorted(d for d in dirs if d not in excluded)
        matching_files.extend(
            Path(root) / file_name
            for file_name in sorted(files)
            if pattern.search(file_name)
        )

    return matching_files


def format_python_snippets_in_mdx(file_path, line_length=DEFAULT_LINE_LENGTH):
"""
Formats Python code snippets inside MDX files using Black.
"""
black_mode = black.FileMode(line_length=line_length)
code_block_pattern = re.compile(r"```python\n(.*?)\n```", re.DOTALL)
code_block_pattern = re.compile(r"(`{3,4})(python|python PYTHON)\n(.*?)\n\1", re.DOTALL)

with open(file_path, 'r', encoding='utf-8') as file:
original_content = file.read()

def format_with_black(match):
    """
    Formats one matched Python code block with Black and returns the replacement text.

    Args:
        match (re.Match): Match whose group 1 is the opening fence (3 or 4
            backticks), group 2 the block label ("python" or "python PYTHON")
            and group 3 the code between the fences.

    Returns:
        str: The block re-emitted with the same fence and label and the
        formatted code, or the original matched text unchanged when Black
        reports nothing to change or cannot parse the snippet.
    """
    backtick_count = match.group(1)  # Preserve the backtick count (``` or ````)
    block_label = match.group(2)  # Preserve the label (python or python PYTHON)
    code = match.group(3)

    # Lines starting with '!' or '%' (notebook shell/magic commands) are not
    # valid Python, so hide them behind a placeholder comment while formatting.
    # Only horizontal whitespace is matched before the marker: `\s` would also
    # match newlines and swallow the blank lines preceding the command.
    processed_code = re.sub(
        r"^([ \t]*)(!|%)(.*)",
        r"\1# TEMP_COMMENT_\2\3",
        code,
        flags=re.MULTILINE,
    )

    try:
        formatted_code = black.format_str(processed_code, mode=black_mode)
    except black.NothingChanged:
        return match.group(0)  # Return the original block if nothing changed
    except black.parsing.InvalidInput as e:
        # Snippets that are not parseable Python are reported and left untouched.
        print(f"Error formatting Python code in {file_path}: {e}")
        return match.group(0)

    # Turn the placeholder comments back into the original '!' / '%' lines,
    # keeping whatever indentation the formatted output gave them.
    reverted_code = re.sub(
        r"^([ \t]*)# TEMP_COMMENT_(!|%)(.*)",
        r"\1\2\3",
        formatted_code,
        flags=re.MULTILINE,
    )

    # Re-wrap the code with the same fence and label it was found with.
    return f"{backtick_count}{block_label}\n{reverted_code.strip()}\n{backtick_count}"

new_content = code_block_pattern.sub(format_with_black, original_content)

Expand All @@ -55,44 +79,30 @@ def format_with_black(match):
return original_content, new_content


def process_mdx_files(directory, file_pattern, exclude_dirs=None, line_length=DEFAULT_LINE_LENGTH):
    """
    Processes all MDX files in the directory, formatting Python code snippets.

    Args:
        directory (Path or str): Path to the directory containing MDX files.
        file_pattern (re.Pattern): Regex pattern to match MDX files.
        exclude_dirs (list of str, optional): Directory names to skip while
            searching for files.
        line_length (int): Line length to use for Black formatting.
    """
    matching_files = find_files_by_pattern(directory, file_pattern, exclude_dirs)

    for file_path in matching_files:
        original_content, new_content = format_python_snippets_in_mdx(file_path, line_length)

        # Report only the files whose content differs after formatting.
        if original_content != new_content:
            print(f"Formatted: {file_path}")


if __name__ == "__main__":
import sys

path = sys.argv[1] if len(sys.argv) > 1 else MDX_DIR
line_length = int(sys.argv[2]) if len(sys.argv) > 2 else DEFAULT_LINE_LENGTH
check_changes = os.getenv("CI") == "true" # Set to True in CI pipeline

if Path(path).is_dir():
process_mdx_files(path, FILE_PATTERN, line_length, check_changes)
process_mdx_files(path, FILE_PATTERN, EXCLUDE_DIRS, line_length)
elif Path(path).is_file():
if FILE_PATTERN.search(path):
process_mdx_files(Path(path).parent, FILE_PATTERN, line_length, check_changes)
process_mdx_files(Path(path).parent, FILE_PATTERN, EXCLUDE_DIRS, line_length)
else:
print("The specified file does not match the MDX pattern.")
else:
Expand Down
18 changes: 16 additions & 2 deletions .github/workflows/check-python-code-snippets.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.head_ref }}

- name: Set up Python
uses: actions/setup-python@v4
Expand All @@ -34,6 +36,18 @@ jobs:
- name: Run Python MDX Snippet Formatter
shell: bash
env:
CI: true
run: poetry run python .github/scripts/check_python_code_snippets.py fern/pages

- name: Check for changes
  id: diff
  run: |
    # `git diff --exit-code` exits non-zero when the working tree differs from
    # the index; in that case expose `changes=true` as a step output. The
    # `::set-output` workflow command is deprecated, so write to the
    # GITHUB_OUTPUT environment file instead.
    git diff --exit-code || echo "changes=true" >> "$GITHUB_OUTPUT"
- name: Commit and Push Changes
if: steps.diff.outputs.changes == 'true'
run: |
git config --local user.email "[email protected]"
git config --local user.name "GitHub Action"
git add -u
git commit -m "Format Python snippets in MDX files"
git push
2 changes: 1 addition & 1 deletion fern/pages/tutorials/build-things-with-cohere.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Next, we'll import the `cohere` library and create a client to be used throughou
import cohere

# Get your API key here: https://dashboard.cohere.com/api-keys
co = cohere.Client(api_key="YOUR_COHERE_API_KEY")
co = cohere.Client(api_key="YOUR_COHERE_API_KEY")
```

# Accessing Cohere from Other Platforms
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ To get started, first we need to install the `cohere` library and create a Coher
import cohere

# Get your API key: https://dashboard.cohere.com/api-keys
co = cohere.Client("COHERE_API_KEY")
co = cohere.Client("COHERE_API_KEY")
```

## Creating a custom preamble
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import numpy as np
import cohere

# Get your API key: https://dashboard.cohere.com/api-keys
co = cohere.Client("COHERE_API_KEY")
co = cohere.Client("COHERE_API_KEY")
```

## Creating tools
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ import numpy as np
import cohere

# Get your API key: https://dashboard.cohere.com/api-keys
co = cohere.Client("COHERE_API_KEY")
co = cohere.Client("COHERE_API_KEY")
```

## Basic RAG
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import numpy as np
import cohere

# Get your API key: https://dashboard.cohere.com/api-keys
co = cohere.Client("COHERE_API_KEY")
co = cohere.Client("COHERE_API_KEY")
```

## Reranking lexical/semantic search results
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import numpy as np
import cohere

# Get your API key: https://dashboard.cohere.com/api-keys
co = cohere.Client("COHERE_API_KEY")
co = cohere.Client("COHERE_API_KEY")
```

## Embedding the documents
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ To get started, first we need to install the `cohere` library and create a Coher
import cohere

# Get your API key: https://dashboard.cohere.com/api-keys
co = cohere.Client("COHERE_API_KEY")
co = cohere.Client("COHERE_API_KEY")
```

## Basic text generation
Expand Down
57 changes: 31 additions & 26 deletions fern/pages/v2/text-generation/structured-outputs.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -141,46 +141,51 @@ In this schema, we defined three keys ("title," "author," "publication_year") an
Here's an example of a nested array. Note that the top level json structure must always be a json object.

```python PYTHON
cohere_api_key = os.getenv('cohere_api_key')
cohere_api_key = os.getenv("cohere_api_key")
co = cohere.ClientV2(cohere_api_key)
response = co.chat(
response_format={
"type": "json_object",
"schema": {
"type": "object",
"properties": {
"actions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"japanese": {"type": "string"},
"romaji": {"type": "string"},
"english": {"type": "string"}
},
"required": ["japanese", "romaji", "english"]
"type": "json_object",
"schema": {
"type": "object",
"properties": {
"actions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"japanese": {"type": "string"},
"romaji": {"type": "string"},
"english": {"type": "string"},
},
"required": ["japanese", "romaji", "english"],
},
}
},
}
"required": ["actions"],
},
"required": ["actions"]
}
},
model="command-r",
messages=[
{"role": "user", "content": "Generate a JSON array of objects with the following fields: japanese, romaji, english. These actions should be japanese verbs provided in the dictionary form.},
]
)
{
"role": "user",
"content": "Generate a JSON array of objects with the following fields: japanese, romaji, english. These actions should be japanese verbs provided in the dictionary form.",
},
],
)
return json.loads(response.message.content[0].text)
```

The output for this example would be:

```json
{"actions": [
{"japanese": "いこう", "romaji": "ikou", "english": "onward"},
{"japanese": "探す", "romaji": "sagasu", "english": "search"},
{"japanese": "話す", "romaji": "hanasu", "english": "talk"}
]
{
"actions": [
{"japanese": "いこう", "romaji": "ikou", "english": "onward"},
{"japanese": "探す", "romaji": "sagasu", "english": "search"},
{"japanese": "話す", "romaji": "hanasu", "english": "talk"}
]
}
```


Expand Down

0 comments on commit 76dc7d8

Please sign in to comment.