Skip to content

Commit

Permalink
propagate file pattern to determine filetype
Browse files Browse the repository at this point in the history
  • Loading branch information
khaledsulayman committed Sep 26, 2024
1 parent 2397c66 commit bec5daf
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 6 deletions.
6 changes: 3 additions & 3 deletions src/instructlab/sdg/utils/chunking_tmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ def _extract_filetypes_from_docs(documents: List):
pdf_docs = []

for doc in documents:
filetype = doc.rsplit(".")
filetype = doc.rsplit(".")[-1]
if filetype == "md":
md_docs.append(doc)
elif filetype == "pdf":
Expand All @@ -560,8 +560,8 @@ def chunk_documents(
List[str]: List of chunked documents.
"""
print(f"THIS IS KHALED: in chunk_documents {leaf_node=}")
leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
documents = leaf_node[0]["document"]
leaf_node_path = leaf_node[0]["pattern"]
documents = leaf_node[0]["pattern"]
print(f"""THIS IS KHALED:
{leaf_node_path=},
{documents=},
Expand Down
8 changes: 5 additions & 3 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def _get_documents(
file_contents.append(pdf_text)

if file_contents:
return file_contents
return file_contents, file_patterns
raise SystemExit("Couldn't find knowledge documents")

except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
Expand Down Expand Up @@ -182,8 +182,9 @@ def _read_taxonomy_file(file_path: str | Path, yamllint_config: str | None = Non
task_description = contents.get("task_description", None)
domain = contents.get("domain")
documents = contents.get("document")
print(f"")

Check warning on line 185 in src/instructlab/sdg/utils/taxonomy.py

View workflow job for this annotation

GitHub Actions / pylint

W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation)
if documents:
documents = _get_documents(source=documents)
document_contents, file_patterns = _get_documents(source=documents)
logger.debug("Content from git repo fetched")

for seed_example in contents.get("seed_examples"):
Expand All @@ -195,7 +196,8 @@ def _read_taxonomy_file(file_path: str | Path, yamllint_config: str | None = Non
"questions_and_answers": question_answer_list,
"context": context,
"taxonomy_path": tax_path,
"document": documents,
"document": document_contents,

Check failure on line 199 in src/instructlab/sdg/utils/taxonomy.py

View workflow job for this annotation

GitHub Actions / pylint

E0606: Possibly using variable 'document_contents' before assignment (possibly-used-before-assignment)
"pattern": file_patterns,

Check failure on line 200 in src/instructlab/sdg/utils/taxonomy.py

View workflow job for this annotation

GitHub Actions / pylint

E0606: Possibly using variable 'file_patterns' before assignment (possibly-used-before-assignment)
"domain": domain,
"document_outline": contents.get("document_outline"),
}
Expand Down

0 comments on commit bec5daf

Please sign in to comment.