Skip to content

Commit

Permalink
feat: async semantic splitter node parser (#17449)
Browse files Browse the repository at this point in the history
  • Loading branch information
mjrowsky authored Jan 9, 2025
1 parent a492b03 commit 8620869
Showing 1 changed file with 55 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,24 @@ def _parse_nodes(

return all_nodes

async def _aparse_nodes(
    self,
    nodes: Sequence[BaseNode],
    show_progress: bool = False,
    **kwargs: Any,
) -> List[BaseNode]:
    """Asynchronously split every input node into semantic nodes.

    Each input node is handed, on its own, to
    ``abuild_semantic_nodes_from_documents`` and awaited before the next
    one is started, so results keep the order of ``nodes``.

    Args:
        nodes: The nodes to split.
        show_progress: Forwarded both to the progress wrapper around
            ``nodes`` and to the per-node semantic builder.
        **kwargs: Accepted but not used by this method.

    Returns:
        A flat list with the nodes produced for every input node.
    """
    parsed: List[BaseNode] = []

    for source_node in get_tqdm_iterable(nodes, show_progress, "Parsing nodes"):
        # One single-element batch per input node; awaited sequentially.
        parsed += await self.abuild_semantic_nodes_from_documents(
            [source_node], show_progress
        )

    return parsed

def build_semantic_nodes_from_documents(
self,
documents: Sequence[Document],
Expand Down Expand Up @@ -171,6 +189,43 @@ def build_semantic_nodes_from_documents(

return all_nodes

async def abuild_semantic_nodes_from_documents(
    self,
    documents: Sequence[Document],
    show_progress: bool = False,
) -> List[BaseNode]:
    """Asynchronously build semantically chunked nodes from documents.

    For each document, in order: split its text with
    ``self.sentence_splitter``, group the splits with
    ``_build_sentence_groups``, embed every group's ``"combined_sentence"``
    in one awaited batch call, compute the distances between groups, turn
    those into chunks, and build nodes from the chunks.

    Args:
        documents: The documents to process.
        show_progress: Forwarded to the embedding model's batch call.

    Returns:
        A flat list of the nodes built for every document.
    """
    result: List[BaseNode] = []

    for document in documents:
        source_text = document.text
        groups = self._build_sentence_groups(self.sentence_splitter(source_text))

        embeddings = await self.embed_model.aget_text_embedding_batch(
            [group["combined_sentence"] for group in groups],
            show_progress=show_progress,
        )

        # Store each embedding on the sentence group at the same position.
        for position, vector in enumerate(embeddings):
            groups[position]["combined_sentence_embedding"] = vector

        distances = self._calculate_distances_between_sentence_groups(groups)

        result.extend(
            build_nodes_from_splits(
                self._build_node_chunks(groups, distances),
                document,
                id_func=self.id_func,
            )
        )

    return result

def _build_sentence_groups(
self, text_splits: List[str]
) -> List[SentenceCombination]:
Expand Down

0 comments on commit 8620869

Please sign in to comment.