move burden of checking document filetype to chunking.py

Signed-off-by: Khaled Sulayman <[email protected]>
instructlab · Sep 24, 2024 · 43d9414 · 43d9414
1 parent 72f1c95
commit 43d9414
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 10 deletions.
diff --git a/src/instructlab/sdg/utils/chunking.py b/src/instructlab/sdg/utils/chunking.py
@@ -58,18 +58,24 @@ def chunk_document(documents: List | str, server_ctx_size, chunk_word_count) ->
     chunk_overlap = _DEFAULT_CHUNK_OVERLAP
 
     # Using Markdown as default, document-specific chunking will be implemented in separate pr.
-    text_splitter = RecursiveCharacterTextSplitter.from_language(
+    md_text_splitter = RecursiveCharacterTextSplitter.from_language(
         language=Language.MARKDOWN,
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
     )
 
     # Determine file type for heuristics, default with markdown
-    for docs in documents:
-        # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
-        docs = re.sub(r"-{2,}\|", "-|", docs)
-        # Remove unnecessary spaces in front of pipe characters in a markdown table.
-        docs = re.sub(r"\  +\|", " |", docs)
-        temp = text_splitter.create_documents([docs])
-        content.extend([item.page_content for item in temp])
+    for doc in documents:
+        filetype = doc.rsplit(".")
+        if filetype == "md":
+            # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
+            doc = re.sub(r"-{2,}\|", "-|", doc)
+            # Remove unnecessary spaces in front of pipe characters in a markdown table.
+            doc = re.sub(r"\  +\|", " |", doc)
+            temp = md_text_splitter.create_documents([doc])
+            content.extend([item.page_content for item in temp])
+        elif filetype == "pdf":
+            pass
+        else:
+            raise ValueError(f"Received document of type .{filetype}, which is not a supported filetype")
     return content
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
@@ -273,11 +273,10 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
 
 def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
     samples = []
-    documents = [ensure_markdown(d) for d in leaf_node[0]["document"]]
     # document is the same for the whole leaf node
     chunks = (
         chunking.chunk_document(
-            documents=documents,
+            documents=leaf_node[0]["document"],
             server_ctx_size=server_ctx_size,
             chunk_word_count=chunk_word_count,
         )