Skip to content

Commit

Permalink
move burden of checking document filetype to chunking.py
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
  • Loading branch information
khaledsulayman committed Sep 24, 2024
1 parent 72f1c95 commit 43d9414
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 10 deletions.
22 changes: 14 additions & 8 deletions src/instructlab/sdg/utils/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,24 @@ def chunk_document(documents: List | str, server_ctx_size, chunk_word_count) ->
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Using Markdown as the default; document-specific chunking will be implemented in a separate PR.
text_splitter = RecursiveCharacterTextSplitter.from_language(
md_text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine the file type for heuristics, defaulting to Markdown
for docs in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
docs = re.sub(r"-{2,}\|", "-|", docs)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
docs = re.sub(r"\ +\|", " |", docs)
temp = text_splitter.create_documents([docs])
content.extend([item.page_content for item in temp])
for doc in documents:
filetype = doc.rsplit(".")
if filetype == "md":
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = md_text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])
elif filetype == "pdf":
pass
else:
raise ValueError(f"Received document of type .{filetype}, which is not a supported filetype")
return content
3 changes: 1 addition & 2 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,10 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):

def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
samples = []
documents = [ensure_markdown(d) for d in leaf_node[0]["document"]]
# document is the same for the whole leaf node
chunks = (
chunking.chunk_document(
documents=documents,
documents=leaf_node[0]["document"],
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
)
Expand Down

0 comments on commit 43d9414

Please sign in to comment.