Rename chat-with-index internal code to src and apply various black formatting fixes.
Azure · Sep 7, 2023 · ceedc9c · ceedc9c
1 parent 5317551
commit ceedc9c
Show file tree

Hide file tree

Showing 26 changed files with 98 additions and 72 deletions.
diff --git a/sdk/python/generative-ai/rag/pup_refresh/data_index_job/cog_search_docs_faiss_mlindex.py b/sdk/python/generative-ai/rag/pup_refresh/data_index_job/cog_search_docs_faiss_mlindex.py
@@ -10,10 +10,19 @@
 from azure.ai.ml import MLClient
 from azure.identity import DefaultAzureCredential
 
-ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
+ml_client = MLClient.from_config(
+    credential=DefaultAzureCredential(), path="config.json"
+)
 
 # %% Create DataIndex configuration
-from azureml.rag.dataindex.entities import Data, DataIndex, IndexSource, CitationRegex, Embedding, IndexStore
+from azureml.rag.dataindex.entities import (
+    Data,
+    DataIndex,
+    IndexSource,
+    CitationRegex,
+    Embedding,
+    IndexStore,
+)
 
 asset_name = "azure_search_docs_aoai_faiss"
 
@@ -29,20 +38,17 @@
         citation_url="https://learn.microsoft.com/en-us/azure",
         # Remove articles from the final citation url and remove the file extension so url points to hosted docs, not GitHub.
         citation_url_replacement_regex=CitationRegex(
-            match_pattern="(.*)/articles/(.*)(\\.[^.]+)$",
-            replacement_pattern="\\1/\\2"
-        )
+            match_pattern="(.*)/articles/(.*)(\\.[^.]+)$", replacement_pattern="\\1/\\2"
+        ),
     ),
     embedding=Embedding(
         model="text-embedding-ada-002",
         connection="azureml-rag-oai",
         cache_path=f"azureml://datastores/workspaceblobstore/paths/embeddings_cache/{asset_name}",
     ),
-    index=IndexStore(
-        type="faiss"
-    ),
+    index=IndexStore(type="faiss"),
     # name is replaced with a unique value each time the job is run
-    path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}"
+    path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}",
 )
 
 # %% Use git_clone Component to clone Azure Search docs from github
@@ -54,17 +60,17 @@
 from azure.ai.ml.dsl import pipeline
 from azureml.rag.dataindex.data_index import index_data
 
+
 @pipeline(default_compute="serverless")
 def git_to_faiss(
     git_url,
     branch_name="",
     git_connection_id="",
 ):
-    git_clone = git_clone_component(
-        git_repository=git_url,
-        branch_name=branch_name
-    )
-    git_clone.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_GIT"] = git_connection_id
+    git_clone = git_clone_component(git_repository=git_url, branch_name=branch_name)
+    git_clone.environment_variables[
+        "AZUREML_WORKSPACE_CONNECTION_ID_GIT"
+    ] = git_connection_id
 
     index_job = index_data(
         description=data_index.description,
@@ -75,10 +81,11 @@ def git_to_faiss(
 
     return index_job.outputs
 
+
 # %%
 git_index_job = git_to_faiss("https://github.com/MicrosoftDocs/azure-docs.git")
 # Ensure repo cloned each run to get latest, comment out to have first clone reused.
-git_index_job.settings.force_rerun = True 
+git_index_job.settings.force_rerun = True
 
 # %% Submit the DataIndex Job
 git_index_run = ml_client.jobs.create_or_update(
@@ -100,7 +107,7 @@ def git_to_faiss(
 mlindex = MLIndex(mlindex_docs_index_asset)
 
 index = mlindex.as_langchain_vectorstore()
-docs = index.similarity_search('How can I enable Semantic Search on my Index?', k=5)
+docs = index.similarity_search("How can I enable Semantic Search on my Index?", k=5)
 docs
 
 # %% Take a look at those chunked docs
@@ -131,10 +138,7 @@ def git_to_faiss(
 with open(flow_path / "flow.dag.yaml", "r") as f:
     flow_yaml = f.read()
     flow_yaml = re.sub(
-        r"path: (.*)# Index uri",
-        f"path: {mlindex_path} # Index uri",
-        flow_yaml,
-        re.M
+        r"path: (.*)# Index uri", f"path: {mlindex_path} # Index uri", flow_yaml, re.M
     )
 with open(flow_path / "flow.dag.yaml", "w") as f:
     f.write(flow_yaml)

diff --git a/sdk/python/generative-ai/rag/pup_refresh/data_index_job/local_docs_to_acs_mlindex.py b/sdk/python/generative-ai/rag/pup_refresh/data_index_job/local_docs_to_acs_mlindex.py
@@ -9,7 +9,9 @@
 from azure.ai.ml import MLClient, load_data
 from azure.identity import DefaultAzureCredential
 
-ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
+ml_client = MLClient.from_config(
+    credential=DefaultAzureCredential(), path="config.json"
+)
 
 # %% Load DataIndex configuration from file
 data_index = load_data("local_docs_to_acs_mlindex.yaml")
@@ -31,7 +33,7 @@
 mlindex = MLIndex(mlindex_docs_index_asset)
 
 index = mlindex.as_langchain_vectorstore()
-docs = index.similarity_search('What is an MLIndex?', k=5)
+docs = index.similarity_search("What is an MLIndex?", k=5)
 docs
 
 # %% Take a look at those chunked docs
@@ -41,4 +43,3 @@
     print(json.dumps({"content": doc.page_content, **doc.metadata}, indent=2))
 
 # %% Try it out with Promptflow
-
diff --git a/sdk/python/generative-ai/rag/pup_refresh/data_index_job/s3_to_acs_mlindex.py b/sdk/python/generative-ai/rag/pup_refresh/data_index_job/s3_to_acs_mlindex.py
@@ -9,10 +9,18 @@
 from azure.ai.ml import MLClient
 from azure.identity import DefaultAzureCredential
 
-ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
+ml_client = MLClient.from_config(
+    credential=DefaultAzureCredential(), path="config.json"
+)
 
 # %% Create DataIndex configuration
-from azureml.rag.dataindex.entities import Data, DataIndex, IndexSource, Embedding, IndexStore
+from azureml.rag.dataindex.entities import (
+    Data,
+    DataIndex,
+    IndexSource,
+    Embedding,
+    IndexStore,
+)
 
 asset_name = "s3_aoai_acs"
 
@@ -36,7 +44,7 @@
         connection="azureml-rag-acs",
     ),
     # name is replaced with a unique value each time the job is run
-    path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}"
+    path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}",
 )
 
 # %% Create the DataIndex Job to be scheduled
@@ -55,13 +63,13 @@
 mlindex_docs_index_asset = ml_client.data.get(data_index.name, label="latest")
 mlindex_docs_index_asset
 
-## %% Try it out with langchain by loading the MLIndex asset using the azureml-rag SDK
+# %% Try it out with langchain by loading the MLIndex asset using the azureml-rag SDK
 from azureml.rag.mlindex import MLIndex
 
 mlindex = MLIndex(mlindex_docs_index_asset)
 
 index = mlindex.as_langchain_vectorstore()
-docs = index.similarity_search('What is RAG?', k=5)
+docs = index.similarity_search("What is RAG?", k=5)
 docs
 
 # %% Take a look at those chunked docs

diff --git a/sdk/python/generative-ai/rag/pup_refresh/data_index_job/scheduled_s3_to_asc_mlindex.py b/sdk/python/generative-ai/rag/pup_refresh/data_index_job/scheduled_s3_to_asc_mlindex.py
@@ -9,10 +9,18 @@
 from azure.ai.ml import MLClient
 from azure.identity import DefaultAzureCredential
 
-ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
+ml_client = MLClient.from_config(
+    credential=DefaultAzureCredential(), path="config.json"
+)
 
 # %% Create DataIndex configuration
-from azureml.rag.dataindex.entities import Data, DataIndex, IndexSource, Embedding, IndexStore
+from azureml.rag.dataindex.entities import (
+    Data,
+    DataIndex,
+    IndexSource,
+    Embedding,
+    IndexStore,
+)
 
 asset_name = "s3_aoai_acs"
 
@@ -36,7 +44,7 @@
         connection="azureml-rag-acs",
     ),
     # name is replaced with a unique value each time the job is run
-    path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}"
+    path=f"azureml://datastores/workspaceblobstore/paths/indexes/{asset_name}/{{name}}",
 )
 
 # %% Create the DataIndex Job to be scheduled
@@ -47,7 +55,7 @@
     # The DataIndex Job will use the identity of the MLClient within the DataIndex Job to access source data.
     identity=UserIdentityConfiguration(),
     # Instead of submitting the Job and returning the Run a PipelineJob configuration is returned which can be used in with a Schedule.
-    submit_job=False
+    submit_job=False,
 )
 
 # %% Create Schedule for DataIndex Job
@@ -61,13 +69,16 @@
 recurrence_trigger = RecurrenceTrigger(
     frequency="day",
     interval=1,
-    #schedule=RecurrencePattern(hours=16, minutes=[15]),
+    # schedule=RecurrencePattern(hours=16, minutes=[15]),
     start_time=schedule_start_time,
     time_zone=TimeZone.UTC,
 )
 
 job_schedule = JobSchedule(
-    name=schedule_name, trigger=recurrence_trigger, create_job=index_job, properties=index_job.properties
+    name=schedule_name,
+    trigger=recurrence_trigger,
+    create_job=index_job,
+    properties=index_job.properties,
 )
 
 # %% Enable Schedule
@@ -89,7 +100,7 @@
 mlindex = MLIndex(onelake_s3_index_asset)
 
 index = mlindex.as_langchain_vectorstore()
-docs = index.similarity_search('What is RAG?', k=5)
+docs = index.similarity_search("What is RAG?", k=5)
 docs
 
 # %% Take a look at those chunked docs

diff --git a/...nerative-ai/rag/pup_refresh/flows/bring_your_own_data_chat_qna/generate_prompt_context.py b/...nerative-ai/rag/pup_refresh/flows/bring_your_own_data_chat_qna/generate_prompt_context.py
@@ -13,7 +13,6 @@ def format_doc(doc: dict):
 
     retrieved_docs = []
     for item in search_result:
-
         entity = SearchResultEntity.from_dict(item)
         content = entity.text or ""
 
@@ -23,9 +22,6 @@ def format_doc(doc: dict):
                 if URL_KEY in entity.metadata[SOURCE_KEY]:
                     source = entity.metadata[SOURCE_KEY][URL_KEY] or ""
 
-        retrieved_docs.append({
-            "Content": content,
-            "Source": source
-        })
+        retrieved_docs.append({"Content": content, "Source": source})
     doc_string = "\n\n".join([format_doc(doc) for doc in retrieved_docs])
     return doc_string
diff --git a/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/chat_with_index_tool.py b/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/chat_with_index_tool.py
@@ -1,5 +1,5 @@
 from promptflow import tool
-from chat_with_index.main import chat_with_index
+from src.main import chat_with_index
 
 
 @tool

diff --git a/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/find_context_tool.py b/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/find_context_tool.py
@@ -1,5 +1,6 @@
 from promptflow import tool
-from chat_with_index.find_context import find_context
+from src.find_context import find_context
+
 
 @tool
 def find_context_tool(question: str, mlindex_uri: str):

diff --git a/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/qna_tool.py b/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/qna_tool.py
@@ -1,5 +1,5 @@
 from promptflow import tool
-from chat_with_index.qna import qna
+from src.qna import qna
 
 
 @tool

diff --git a/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/rewrite_question_tool.py b/sdk/python/generative-ai/rag/pup_refresh/flows/chat-with-index/rewrite_question_tool.py
@@ -1,5 +1,5 @@
 from promptflow import tool
-from chat_with_index.rewrite_question import rewrite_question
+from src.rewrite_question import rewrite_question
 
 
 @tool

diff --git a/...chat-with-index/chat_with_index/README.md → ...fresh/flows/chat-with-index/src/README.md b/...chat-with-index/chat_with_index/README.md → ...fresh/flows/chat-with-index/src/README.md
diff --git a/...at-with-index/chat_with_index/__init__.py → ...esh/flows/chat-with-index/src/__init__.py b/...at-with-index/chat_with_index/__init__.py → ...esh/flows/chat-with-index/src/__init__.py
diff --git a/...ith-index/chat_with_index/find_context.py → ...flows/chat-with-index/src/find_context.py b/...ith-index/chat_with_index/find_context.py → ...flows/chat-with-index/src/find_context.py
diff --git a/...s/chat-with-index/chat_with_index/main.py → ...refresh/flows/chat-with-index/src/main.py b/...s/chat-with-index/chat_with_index/main.py → ...refresh/flows/chat-with-index/src/main.py
@@ -5,6 +5,7 @@
 from qna import qna
 from find_context import find_context
 from rewrite_question import rewrite_question
+
 # from build_index import create_faiss_index
 # from utils.lock import acquire_lock
 
@@ -52,7 +53,9 @@ def main_loop(mlindex_uri: str):
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Ask questions about the contents of an MLIndex.")
+    parser = argparse.ArgumentParser(
+        description="Ask questions about the contents of an MLIndex."
+    )
     parser.add_argument("mlindex_uri", help="URI to MLIndex")
     args = parser.parse_args()
 

diff --git a/...ws/chat-with-index/chat_with_index/qna.py → ..._refresh/flows/chat-with-index/src/qna.py b/...ws/chat-with-index/chat_with_index/qna.py → ..._refresh/flows/chat-with-index/src/qna.py
diff --git a/...-with-index/chat_with_index/qna_prompt.md → ...h/flows/chat-with-index/src/qna_prompt.md b/...-with-index/chat_with_index/qna_prompt.md → ...h/flows/chat-with-index/src/qna_prompt.md
diff --git a/...index/chat_with_index/rewrite_question.py → ...s/chat-with-index/src/rewrite_question.py b/...index/chat_with_index/rewrite_question.py → ...s/chat-with-index/src/rewrite_question.py
diff --git a/...hat_with_index/rewrite_question_prompt.md → ...with-index/src/rewrite_question_prompt.md b/...hat_with_index/rewrite_question_prompt.md → ...with-index/src/rewrite_question_prompt.md
diff --git a/...hat-with-index/chat_with_index/test.ipynb → ...resh/flows/chat-with-index/src/test.ipynb b/...hat-with-index/chat_with_index/test.ipynb → ...resh/flows/chat-with-index/src/test.ipynb
diff --git a/...h-index/chat_with_index/utils/__init__.py → ...ows/chat-with-index/src/utils/__init__.py b/...h-index/chat_with_index/utils/__init__.py → ...ows/chat-with-index/src/utils/__init__.py
diff --git a/...th-index/chat_with_index/utils/logging.py → ...lows/chat-with-index/src/utils/logging.py b/...th-index/chat_with_index/utils/logging.py → ...lows/chat-with-index/src/utils/logging.py
diff --git a/...t-with-index/chat_with_index/utils/oai.py → ...sh/flows/chat-with-index/src/utils/oai.py b/...t-with-index/chat_with_index/utils/oai.py → ...sh/flows/chat-with-index/src/utils/oai.py
diff --git a/...with-index/chat_with_index/utils/retry.py → .../flows/chat-with-index/src/utils/retry.py b/...with-index/chat_with_index/utils/retry.py → .../flows/chat-with-index/src/utils/retry.py
diff --git a/sdk/python/generative-ai/rag/pup_refresh/mlindex_local/langchain_docs_to_mlindex.py b/sdk/python/generative-ai/rag/pup_refresh/mlindex_local/langchain_docs_to_mlindex.py
@@ -10,21 +10,25 @@
 from azure.ai.ml import MLClient
 from azure.identity import DefaultAzureCredential
 
-ml_client = MLClient.from_config(credential=DefaultAzureCredential(), path="config.json")
+ml_client = MLClient.from_config(
+    credential=DefaultAzureCredential(), path="config.json"
+)
 
 acs_connection = ml_client.connections.get("azureml-rag-acs")
 aoai_connection = ml_client.connections.get("azureml-rag-oai")
 
 # %% https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/wikipedia.html
 from langchain.document_loaders import WikipediaLoader
 
-docs = WikipediaLoader(query='HUNTER X HUNTER', load_max_docs=10).load()
+docs = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=10).load()
 len(docs)
 
 # %%
 from langchain.text_splitter import MarkdownTextSplitter
 
-split_docs = MarkdownTextSplitter.from_tiktoken_encoder(chunk_size=1024).split_documents(docs)
+split_docs = MarkdownTextSplitter.from_tiktoken_encoder(
+    chunk_size=1024
+).split_documents(docs)
 
 # %%
 from azureml.rag.mlindex import MLIndex
@@ -35,14 +39,12 @@
     embeddings_model="azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002",
     embeddings_connection=aoai_connection,
     embeddings_container="./.embeddings_cache/hunter_x_hunter_aoai_acs",
-    index_type='acs',
+    index_type="acs",
     index_connection=acs_connection,
-    index_config={
-        'index_name': 'hunter_x_hunter_aoai_acs'
-    }
+    index_config={"index_name": "hunter_x_hunter_aoai_acs"},
 )
 
 # %% Query documents, use with inferencing framework
 index = mlindex.as_langchain_vectorstore()
-docs = index.similarity_search('What is bungie gum?', k=5)
+docs = index.similarity_search("What is bungie gum?", k=5)
 print(docs)
diff --git a/sdk/python/generative-ai/rag/pup_refresh/mlindex_local/local_docs_to_acs_aoai_mlindex.py b/sdk/python/generative-ai/rag/pup_refresh/mlindex_local/local_docs_to_acs_aoai_mlindex.py
@@ -19,18 +19,16 @@
 
 # Process data into FAISS Index using HuggingFace embeddings
 mlindex = MLIndex.from_files(
-    source_uri='../',
-    source_glob='**/*',
+    source_uri="../",
+    source_glob="**/*",
     chunk_size=200,
     embeddings_model="azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002",
     embeddings_connection=aoai_connection,
     embeddings_container="./.embeddings_cache/mlindex_docs_aoai_acs",
-    index_type='acs',
+    index_type="acs",
     index_connection=acs_connection,
-    index_config={
-        'index_name': 'mlindex_docs_aoai_acs'
-    },
-    output_path="./acs_open_ai_index"
+    index_config={"index_name": "mlindex_docs_aoai_acs"},
+    output_path="./acs_open_ai_index",
 )
 
 # %% Load MLIndex from local
@@ -40,5 +38,5 @@
 
 # %% Query documents, use with inferencing framework
 index = mlindex.as_langchain_vectorstore()
-docs = index.similarity_search('Topic in my data.', k=5)
+docs = index.similarity_search("Topic in my data.", k=5)
 print(docs)