Skip to content

Commit

Permalink
Update integrated vectorization
Browse files (browse the repository at this point in the history)
  • Loading branch information
pamelafox committed Oct 16, 2024
1 parent e99f6e2 commit d4e40b8
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 24 deletions.
32 changes: 12 additions & 20 deletions app/backend/prepdocslib/integratedvectorizerstrategy.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,16 +6,15 @@
)
from azure.search.documents.indexes.models import (
AzureOpenAIEmbeddingSkill,
AzureOpenAIParameters,
AzureOpenAIVectorizer,
FieldMapping,
IndexProjectionMode,
InputFieldMappingEntry,
OutputFieldMappingEntry,
SearchIndexer,
SearchIndexerDataContainer,
SearchIndexerDataSourceConnection,
SearchIndexerIndexProjections,
SearchIndexerDataSourceType,
SearchIndexerIndexProjection,
SearchIndexerIndexProjectionSelector,
SearchIndexerIndexProjectionsParameters,
SearchIndexerSkillset,
Expand Down Expand Up @@ -67,6 +66,7 @@ async def create_embedding_skill(self, index_name: str):
skillset_name = f"{index_name}-skillset"

split_skill = SplitSkill(
name=f"{index_name}-split-skill",
description="Split skill to chunk documents",
text_split_mode="pages",
context="/document",
Expand All @@ -82,17 +82,20 @@ async def create_embedding_skill(self, index_name: str):
raise ValueError("Expecting Azure Open AI instance")

embedding_skill = AzureOpenAIEmbeddingSkill(
name=f"{index_name}-embedding-skill",
description="Skill to generate embeddings via Azure OpenAI",
context="/document/pages/*",
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_name=self.embeddings.open_ai_deployment,
model_name=self.embeddings.open_ai_model_name,
dimensions=self.embeddings.open_ai_dimensions,
inputs=[
InputFieldMappingEntry(name="text", source="/document/pages/*"),
],
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

index_projections = SearchIndexerIndexProjections(
index_projection = SearchIndexerIndexProjection(
selectors=[
SearchIndexerIndexProjectionSelector(
target_index_name=index_name,
Expand All @@ -114,7 +117,7 @@ async def create_embedding_skill(self, index_name: str):
name=skillset_name,
description="Skillset to chunk documents and generate embeddings",
skills=[split_skill, embedding_skill],
index_projections=index_projections,
index_projection=index_projection,
)

return skillset
Expand All @@ -132,25 +135,14 @@ async def setup(self):
if self.embeddings is None:
raise ValueError("Expecting Azure Open AI instance")

await search_manager.create_index(
vectorizers=[
AzureOpenAIVectorizer(
name=f"{self.search_info.index_name}-vectorizer",
kind="azureOpenAI",
azure_open_ai_parameters=AzureOpenAIParameters(
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
),
),
]
)
await search_manager.create_index()

# create indexer client
ds_client = self.search_info.create_search_indexer_client()
ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
data_source_connection = SearchIndexerDataSourceConnection(
name=f"{self.search_info.index_name}-blob",
type="azureblob",
type=SearchIndexerDataSourceType.AZURE_BLOB,
connection_string=self.blob_manager.get_managedidentity_connectionstring(),
container=ds_container,
data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(),
Expand Down
15 changes: 13 additions & 2 deletions app/backend/prepdocslib/searchmanager.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import List, Optional

from azure.search.documents.indexes.models import (
AzureOpenAIVectorizer,
AzureOpenAIVectorizerParameters,
HnswAlgorithmConfiguration,
HnswParameters,
SearchableField,
Expand Down Expand Up @@ -174,12 +176,21 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
VectorSearchProfile(
name="embedding_config",
algorithm_configuration_name="hnsw_config",
vectorizer=(
vectorizer_name=(
f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None
),
),
],
vectorizers=vectorizers,
vectorizers=[
AzureOpenAIVectorizer(
vectorizer_name=f"{self.search_info.index_name}-vectorizer",
parameters=AzureOpenAIVectorizerParameters(
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_name=self.embeddings.open_ai_deployment,
model_name=self.embeddings.open_ai_model_name,
),
),
],
),
)
if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.in
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,7 +7,7 @@ tiktoken
tenacity
azure-ai-documentintelligence
azure-cognitiveservices-speech
azure-search-documents==11.6.0b5
azure-search-documents==11.6.0b6
azure-storage-blob
azure-storage-file-datalake
uvicorn
Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -52,7 +52,7 @@ azure-monitor-opentelemetry==1.6.1
# via -r requirements.in
azure-monitor-opentelemetry-exporter==1.0.0b28
# via azure-monitor-opentelemetry
azure-search-documents==11.6.0b1
azure-search-documents==11.6.0b6
# via -r requirements.in
azure-storage-blob==12.22.0
# via
Expand Down

0 comments on commit d4e40b8

Please sign in to comment.