Skip to content

Commit

Permalink
Rename chat-with-index internal code to src and apply various black formatting fixes.
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucas Pickup committed Sep 7, 2023
1 parent 5317551 commit ceedc9c
Show file tree
Hide file tree
Showing 26 changed files with 98 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,19 @@
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
ml_client = MLClient.from_config(
credential=DefaultAzureCredential(), path="config.json"
)

# %% Create DataIndex configuration
from azureml.rag.dataindex.entities import Data, DataIndex, IndexSource, CitationRegex, Embedding, IndexStore
from azureml.rag.dataindex.entities import (
Data,
DataIndex,
IndexSource,
CitationRegex,
Embedding,
IndexStore,
)

asset_name = "azure_search_docs_aoai_faiss"

Expand All @@ -29,20 +38,17 @@
citation_url="https://learn.microsoft.com/en-us/azure",
# Strip the "/articles" path segment and the file extension from the final citation url so it points to the hosted docs, not GitHub.
citation_url_replacement_regex=CitationRegex(
match_pattern="(.*)/articles/(.*)(\\.[^.]+)$",
replacement_pattern="\\1/\\2"
)
match_pattern="(.*)/articles/(.*)(\\.[^.]+)$", replacement_pattern="\\1/\\2"
),
),
embedding=Embedding(
model="text-embedding-ada-002",
connection="azureml-rag-oai",
cache_path=f"azureml://datastores/workspaceblobstore/paths/embeddings_cache/{asset_name}",
),
index=IndexStore(
type="faiss"
),
index=IndexStore(type="faiss"),
# name is replaced with a unique value each time the job is run
path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}"
path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}",
)

# %% Use git_clone Component to clone Azure Search docs from github
Expand All @@ -54,17 +60,17 @@
from azure.ai.ml.dsl import pipeline
from azureml.rag.dataindex.data_index import index_data


@pipeline(default_compute="serverless")
def git_to_faiss(
git_url,
branch_name="",
git_connection_id="",
):
git_clone = git_clone_component(
git_repository=git_url,
branch_name=branch_name
)
git_clone.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_GIT"] = git_connection_id
git_clone = git_clone_component(git_repository=git_url, branch_name=branch_name)
git_clone.environment_variables[
"AZUREML_WORKSPACE_CONNECTION_ID_GIT"
] = git_connection_id

index_job = index_data(
description=data_index.description,
Expand All @@ -75,10 +81,11 @@ def git_to_faiss(

return index_job.outputs


# %%
git_index_job = git_to_faiss("https://github.com/MicrosoftDocs/azure-docs.git")
# Ensure the repo is cloned on every run to get the latest content; comment this out to reuse the first clone.
git_index_job.settings.force_rerun = True
git_index_job.settings.force_rerun = True

# %% Submit the DataIndex Job
git_index_run = ml_client.jobs.create_or_update(
Expand All @@ -100,7 +107,7 @@ def git_to_faiss(
mlindex = MLIndex(mlindex_docs_index_asset)

index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search('How can I enable Semantic Search on my Index?', k=5)
docs = index.similarity_search("How can I enable Semantic Search on my Index?", k=5)
docs

# %% Take a look at those chunked docs
Expand Down Expand Up @@ -131,10 +138,7 @@ def git_to_faiss(
with open(flow_path / "flow.dag.yaml", "r") as f:
flow_yaml = f.read()
flow_yaml = re.sub(
r"path: (.*)# Index uri",
f"path: {mlindex_path} # Index uri",
flow_yaml,
re.M
r"path: (.*)# Index uri", f"path: {mlindex_path} # Index uri", flow_yaml, re.M
)
with open(flow_path / "flow.dag.yaml", "w") as f:
f.write(flow_yaml)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from azure.ai.ml import MLClient, load_data
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
ml_client = MLClient.from_config(
credential=DefaultAzureCredential(), path="config.json"
)

# %% Load DataIndex configuration from file
data_index = load_data("local_docs_to_acs_mlindex.yaml")
Expand All @@ -31,7 +33,7 @@
mlindex = MLIndex(mlindex_docs_index_asset)

index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search('What is an MLIndex?', k=5)
docs = index.similarity_search("What is an MLIndex?", k=5)
docs

# %% Take a look at those chunked docs
Expand All @@ -41,4 +43,3 @@
print(json.dumps({"content": doc.page_content, **doc.metadata}, indent=2))

# %% Try it out with Promptflow

Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,18 @@
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
ml_client = MLClient.from_config(
credential=DefaultAzureCredential(), path="config.json"
)

# %% Create DataIndex configuration
from azureml.rag.dataindex.entities import Data, DataIndex, IndexSource, Embedding, IndexStore
from azureml.rag.dataindex.entities import (
Data,
DataIndex,
IndexSource,
Embedding,
IndexStore,
)

asset_name = "s3_aoai_acs"

Expand All @@ -36,7 +44,7 @@
connection="azureml-rag-acs",
),
# name is replaced with a unique value each time the job is run
path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}"
path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}",
)

# %% Create the DataIndex Job to be scheduled
Expand All @@ -55,13 +63,13 @@
mlindex_docs_index_asset = ml_client.data.get(data_index.name, label="latest")
mlindex_docs_index_asset

## %% Try it out with langchain by loading the MLIndex asset using the azureml-rag SDK
# %% Try it out with langchain by loading the MLIndex asset using the azureml-rag SDK
from azureml.rag.mlindex import MLIndex

mlindex = MLIndex(mlindex_docs_index_asset)

index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search('What is RAG?', k=5)
docs = index.similarity_search("What is RAG?", k=5)
docs

# %% Take a look at those chunked docs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,18 @@
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
ml_client = MLClient.from_config(
credential=DefaultAzureCredential(), path="config.json"
)

# %% Create DataIndex configuration
from azureml.rag.dataindex.entities import Data, DataIndex, IndexSource, Embedding, IndexStore
from azureml.rag.dataindex.entities import (
Data,
DataIndex,
IndexSource,
Embedding,
IndexStore,
)

asset_name = "s3_aoai_acs"

Expand All @@ -36,7 +44,7 @@
connection="azureml-rag-acs",
),
# name is replaced with a unique value each time the job is run
path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}"
path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}",
)

# %% Create the DataIndex Job to be scheduled
Expand All @@ -47,7 +55,7 @@
# The DataIndex Job will use the identity of the MLClient within the DataIndex Job to access source data.
identity=UserIdentityConfiguration(),
# Instead of submitting the Job and returning the Run, a PipelineJob configuration is returned which can be used with a Schedule.
submit_job=False
submit_job=False,
)

# %% Create Schedule for DataIndex Job
Expand All @@ -61,13 +69,16 @@
recurrence_trigger = RecurrenceTrigger(
frequency="day",
interval=1,
#schedule=RecurrencePattern(hours=16, minutes=[15]),
# schedule=RecurrencePattern(hours=16, minutes=[15]),
start_time=schedule_start_time,
time_zone=TimeZone.UTC,
)

job_schedule = JobSchedule(
name=schedule_name, trigger=recurrence_trigger, create_job=index_job, properties=index_job.properties
name=schedule_name,
trigger=recurrence_trigger,
create_job=index_job,
properties=index_job.properties,
)

# %% Enable Schedule
Expand All @@ -89,7 +100,7 @@
mlindex = MLIndex(onelake_s3_index_asset)

index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search('What is RAG?', k=5)
docs = index.similarity_search("What is RAG?", k=5)
docs

# %% Take a look at those chunked docs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ def format_doc(doc: dict):

retrieved_docs = []
for item in search_result:

entity = SearchResultEntity.from_dict(item)
content = entity.text or ""

Expand All @@ -23,9 +22,6 @@ def format_doc(doc: dict):
if URL_KEY in entity.metadata[SOURCE_KEY]:
source = entity.metadata[SOURCE_KEY][URL_KEY] or ""

retrieved_docs.append({
"Content": content,
"Source": source
})
retrieved_docs.append({"Content": content, "Source": source})
doc_string = "\n\n".join([format_doc(doc) for doc in retrieved_docs])
return doc_string
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from promptflow import tool
from chat_with_index.main import chat_with_index
from src.main import chat_with_index


@tool
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from promptflow import tool
from chat_with_index.find_context import find_context
from src.find_context import find_context


@tool
def find_context_tool(question: str, mlindex_uri: str):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from promptflow import tool
from chat_with_index.qna import qna
from src.qna import qna


@tool
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from promptflow import tool
from chat_with_index.rewrite_question import rewrite_question
from src.rewrite_question import rewrite_question


@tool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from qna import qna
from find_context import find_context
from rewrite_question import rewrite_question

# from build_index import create_faiss_index
# from utils.lock import acquire_lock

Expand Down Expand Up @@ -52,7 +53,9 @@ def main_loop(mlindex_uri: str):


def main():
parser = argparse.ArgumentParser(description="Ask questions about the contents of an MLIndex.")
parser = argparse.ArgumentParser(
description="Ask questions about the contents of an MLIndex."
)
parser.add_argument("mlindex_uri", help="URI to MLIndex")
args = parser.parse_args()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,25 @@
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
ml_client = MLClient.from_config(
credential=DefaultAzureCredential(), path="config.json"
)

acs_connection = ml_client.connections.get("azureml-rag-acs")
aoai_connection = ml_client.connections.get("azureml-rag-oai")

# %% https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/wikipedia.html
from langchain.document_loaders import WikipediaLoader

docs = WikipediaLoader(query='HUNTER X HUNTER', load_max_docs=10).load()
docs = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=10).load()
len(docs)

# %%
from langchain.text_splitter import MarkdownTextSplitter

split_docs = MarkdownTextSplitter.from_tiktoken_encoder(chunk_size=1024).split_documents(docs)
split_docs = MarkdownTextSplitter.from_tiktoken_encoder(
chunk_size=1024
).split_documents(docs)

# %%
from azureml.rag.mlindex import MLIndex
Expand All @@ -35,14 +39,12 @@
embeddings_model="azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002",
embeddings_connection=aoai_connection,
embeddings_container="./.embeddings_cache/hunter_x_hunter_aoai_acs",
index_type='acs',
index_type="acs",
index_connection=acs_connection,
index_config={
'index_name': 'hunter_x_hunter_aoai_acs'
}
index_config={"index_name": "hunter_x_hunter_aoai_acs"},
)

# %% Query documents, use with inferencing framework
index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search('What is bungie gum?', k=5)
docs = index.similarity_search("What is bungie gum?", k=5)
print(docs)
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,16 @@

# Process data into an ACS index using Azure OpenAI embeddings
mlindex = MLIndex.from_files(
source_uri='../',
source_glob='**/*',
source_uri="../",
source_glob="**/*",
chunk_size=200,
embeddings_model="azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002",
embeddings_connection=aoai_connection,
embeddings_container="./.embeddings_cache/mlindex_docs_aoai_acs",
index_type='acs',
index_type="acs",
index_connection=acs_connection,
index_config={
'index_name': 'mlindex_docs_aoai_acs'
},
output_path="./acs_open_ai_index"
index_config={"index_name": "mlindex_docs_aoai_acs"},
output_path="./acs_open_ai_index",
)

# %% Load MLIndex from local
Expand All @@ -40,5 +38,5 @@

# %% Query documents, use with inferencing framework
index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search('Topic in my data.', k=5)
docs = index.similarity_search("Topic in my data.", k=5)
print(docs)
Loading

0 comments on commit ceedc9c

Please sign in to comment.