diff --git a/app/backend/app.py b/app/backend/app.py
index 226ef98506..386ce6881a 100644
--- a/app/backend/app.py
+++ b/app/backend/app.py
@@ -714,9 +714,10 @@ def create_app():
# Log levels should be one of https://docs.python.org/3/library/logging.html#logging-levels
# Set root level to WARNING to avoid seeing overly verbose logs from SDKS
logging.basicConfig(level=logging.WARNING)
- # Set the app logger level to INFO by default
- default_level = "INFO"
- app.logger.setLevel(os.getenv("APP_LOG_LEVEL", default_level))
+ # Set our own logger levels to INFO by default
+ app_level = os.getenv("APP_LOG_LEVEL", "INFO")
+ app.logger.setLevel(app_level)
+ logging.getLogger("scripts").setLevel(app_level)
if allowed_origin := os.getenv("ALLOWED_ORIGIN"):
app.logger.info("ALLOWED_ORIGIN is set, enabling CORS for %s", allowed_origin)
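Because the backend now reads its configuration from the azd environment when running locally (see `load_azd_env.py` below), the log level can be adjusted per environment. A small sketch, assuming you want more verbose local logging:

```shell
azd env set APP_LOG_LEVEL DEBUG
```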
diff --git a/app/backend/load_azd_env.py b/app/backend/load_azd_env.py
new file mode 100644
index 0000000000..5a6334ab6f
--- /dev/null
+++ b/app/backend/load_azd_env.py
@@ -0,0 +1,23 @@
+import json
+import logging
+import subprocess
+
+from dotenv import load_dotenv
+
+logger = logging.getLogger("scripts")
+
+
+def load_azd_env():
+ """Get path to current azd env file and load file using python-dotenv"""
+ result = subprocess.run("azd env list -o json", shell=True, capture_output=True, text=True)
+ if result.returncode != 0:
+ raise Exception("Error loading azd env")
+ env_json = json.loads(result.stdout)
+ env_file_path = None
+ for entry in env_json:
+ if entry["IsDefault"]:
+ env_file_path = entry["DotEnvPath"]
+ if not env_file_path:
+ raise Exception("No default azd env file found")
+ logger.info(f"Loading azd env from {env_file_path}")
+ load_dotenv(env_file_path, override=True)
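The helper assumes the `azd` CLI is installed and that at least one azd environment exists. You can check which environment (and which `.env` file) is currently marked as default with:

```shell
azd env list -o json
```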
diff --git a/app/backend/main.py b/app/backend/main.py
index 0a23b5abbf..0f2914a483 100644
--- a/app/backend/main.py
+++ b/app/backend/main.py
@@ -1,3 +1,12 @@
+import os
+
from app import create_app
+from load_azd_env import load_azd_env
+
+# WEBSITE_HOSTNAME is always set by App Service, RUNNING_IN_PRODUCTION is set in main.bicep
+RUNNING_ON_AZURE = os.getenv("WEBSITE_HOSTNAME") is not None or os.getenv("RUNNING_IN_PRODUCTION") is not None
+
+if not RUNNING_ON_AZURE:
+ load_azd_env()
app = create_app()
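With the azd `.env` file loaded automatically for local runs, the backend no longer needs an env-exporting wrapper script. A minimal sketch of a local start, assuming the Quart CLI (adjust host, port, and reload flags to your setup):

```shell
cd app/backend
python -m quart --app main:app run --reload
```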
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
index deea428139..696cf6397f 100644
--- a/app/backend/prepdocs.py
+++ b/app/backend/prepdocs.py
@@ -1,12 +1,14 @@
import argparse
import asyncio
import logging
+import os
from typing import Optional, Union
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
+from load_azd_env import load_azd_env
from prepdocslib.blobmanager import BlobManager
from prepdocslib.embeddings import (
AzureOpenAIEmbeddingService,
@@ -31,7 +33,7 @@
from prepdocslib.textparser import TextParser
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]:
@@ -154,10 +156,9 @@ def setup_file_processors(
local_html_parser: bool = False,
search_images: bool = False,
):
- html_parser: Parser
- pdf_parser: Parser
- doc_int_parser: DocumentAnalysisParser
+ sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
+ doc_int_parser: Optional[DocumentAnalysisParser] = None
# check if Azure Document Intelligence credentials are provided
if document_intelligence_service is not None:
documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = (
@@ -167,31 +168,50 @@ def setup_file_processors(
endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
credential=documentintelligence_creds,
)
+
+ pdf_parser: Optional[Parser] = None
if local_pdf_parser or document_intelligence_service is None:
pdf_parser = LocalPdfParser()
- else:
+ elif document_intelligence_service is not None:
pdf_parser = doc_int_parser
+ else:
+ logger.warning("No PDF parser available")
+
+ html_parser: Optional[Parser] = None
if local_html_parser or document_intelligence_service is None:
html_parser = LocalHTMLParser()
- else:
+ elif document_intelligence_service is not None:
html_parser = doc_int_parser
- sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
- return {
- ".pdf": FileProcessor(pdf_parser, sentence_text_splitter),
- ".html": FileProcessor(html_parser, sentence_text_splitter),
+ else:
+ logger.warning("No HTML parser available")
+
+ # These file formats can always be parsed:
+ file_processors = {
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
- ".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".png": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter),
- ".heic": FileProcessor(doc_int_parser, sentence_text_splitter),
".md": FileProcessor(TextParser(), sentence_text_splitter),
".txt": FileProcessor(TextParser(), sentence_text_splitter),
}
+ # These require either a Python package or Document Intelligence
+ if pdf_parser is not None:
+ file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)})
+ if html_parser is not None:
+ file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)})
+ # These file formats require Document Intelligence
+ if doc_int_parser is not None:
+ file_processors.update(
+ {
+ ".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".png": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter),
+ ".heic": FileProcessor(doc_int_parser, sentence_text_splitter),
+ }
+ )
+ return file_processors
def setup_image_embeddings_service(
@@ -218,111 +238,19 @@ async def main(strategy: Strategy, setup_index: bool = True):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
- epilog="Example: prepdocs.py '.\\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v",
+ epilog="Example: prepdocs.py '.\\data\*' -v",
)
parser.add_argument("files", nargs="?", help="Files to be processed")
- parser.add_argument(
- "--datalakestorageaccount", required=False, help="Optional. Azure Data Lake Storage Gen2 Account name"
- )
- parser.add_argument(
- "--datalakefilesystem",
- required=False,
- default="gptkbcontainer",
- help="Optional. Azure Data Lake Storage Gen2 filesystem name",
- )
- parser.add_argument(
- "--datalakepath",
- required=False,
- help="Optional. Azure Data Lake Storage Gen2 filesystem path containing files to index. If omitted, index the entire filesystem",
- )
- parser.add_argument(
- "--datalakekey", required=False, help="Optional. Use this key when authenticating to Azure Data Lake Gen2"
- )
- parser.add_argument(
- "--useacls", action="store_true", help="Store ACLs from Azure Data Lake Gen2 Filesystem in the search index"
- )
+
parser.add_argument(
"--category", help="Value for the category field in the search index for all sections indexed in this run"
)
parser.add_argument(
"--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage"
)
- parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
- parser.add_argument("--container", help="Azure Blob Storage container name")
- parser.add_argument("--storageresourcegroup", help="Azure blob storage resource group")
- parser.add_argument(
- "--storagekey",
- required=False,
- help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)",
- )
- parser.add_argument(
- "--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)"
- )
- parser.add_argument(
- "--subscriptionid",
- required=False,
- help="Optional. Use this to define managed identity connection string in integrated vectorization",
- )
- parser.add_argument(
- "--searchservice",
- help="Name of the Azure AI Search service where content should be indexed (must exist already)",
- )
- parser.add_argument(
- "--searchserviceassignedid",
- required=False,
- help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)",
- )
- parser.add_argument(
- "--index",
- help="Name of the Azure AI Search index where content should be indexed (will be created if it doesn't exist)",
- )
- parser.add_argument(
- "--searchkey",
- required=False,
- help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)",
- )
- parser.add_argument(
- "--searchanalyzername",
- required=False,
- default="en.microsoft",
- help="Optional. Name of the Azure AI Search analyzer to use for the content field in the index",
- )
- parser.add_argument("--openaihost", help="Host of the API used to compute embeddings ('azure' or 'openai')")
- parser.add_argument("--openaiservice", help="Name of the Azure OpenAI service used to compute embeddings")
- parser.add_argument(
- "--openaideployment",
- help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)",
- )
- parser.add_argument(
- "--openaimodelname", help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)"
- )
- parser.add_argument(
- "--openaidimensions",
- required=False,
- default=1536,
- type=int,
- help="Dimensions for the embedding model (defaults to 1536 for 'text-embedding-ada-002')",
- )
- parser.add_argument(
- "--novectors",
- action="store_true",
- help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)",
- )
parser.add_argument(
"--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections"
)
-
- parser.add_argument(
- "--openaicustomurl",
- required=False,
- help="Optional. Use this custom OpenAI URL instead of the default OpenAI URL",
- )
- parser.add_argument(
- "--openaikey",
- required=False,
- help="Optional. Use this OpenAI account key instead of the current Azure user identity to login.",
- )
- parser.add_argument("--openaiorg", required=False, help="This is required only when using non-Azure endpoints.")
parser.add_argument(
"--remove",
action="store_true",
@@ -333,42 +261,32 @@ async def main(strategy: Strategy, setup_index: bool = True):
action="store_true",
help="Remove all blobs from blob storage and documents from the search index",
)
+
+ # Optional key specification:
parser.add_argument(
- "--localpdfparser",
- action="store_true",
- help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Document Intelligence service to extract text, tables and layout from the documents",
- )
- parser.add_argument(
- "--localhtmlparser",
- action="store_true",
- help="Use Beautiful soap local HTML parser instead of Azure Document Intelligence service to extract text, tables and layout from the documents",
- )
- parser.add_argument(
- "--documentintelligenceservice",
+ "--searchkey",
required=False,
- help="Optional. Name of the Azure Document Intelligence service which will be used to extract text, tables and layout from the documents (must exist already)",
+ help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)",
)
parser.add_argument(
- "--documentintelligencekey",
+ "--storagekey",
required=False,
- help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)",
+ help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)",
)
parser.add_argument(
- "--searchimages",
- action="store_true",
- required=False,
- help="Optional. Generate image embeddings to enable each page to be searched as an image",
+ "--datalakekey", required=False, help="Optional. Use this key when authenticating to Azure Data Lake Gen2"
)
parser.add_argument(
- "--visionendpoint",
+ "--documentintelligencekey",
required=False,
- help="Optional, required if --searchimages is specified. Endpoint of Azure AI Vision service to use when embedding images.",
+ help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)",
)
parser.add_argument(
- "--useintvectorization",
+ "--searchserviceassignedid",
required=False,
- help="Required if --useintvectorization is specified. Enable Integrated vectorizer indexer support which is in preview)",
+ help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)",
)
+
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
args = parser.parse_args()
@@ -378,12 +296,17 @@ async def main(strategy: Strategy, setup_index: bool = True):
# to avoid seeing the noisy INFO level logs from the Azure SDKs
logger.setLevel(logging.INFO)
- use_int_vectorization = args.useintvectorization and args.useintvectorization.lower() == "true"
+ load_azd_env()
+
+ use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true"
+ use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true"
+ use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None
+ dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false"
- # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them
- if args.tenantid:
- logger.info("Connecting to Azure services using the azd credential for tenant %s", args.tenantid)
- azd_credential = AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60)
+ # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments.
+ if tenant_id := os.getenv("AZURE_TENANT_ID"):
+ logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id)
+ azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60)
else:
logger.info("Connecting to Azure services using the azd credential for home tenant")
azd_credential = AzureDeveloperCliCredential(process_timeout=60)
@@ -400,40 +323,51 @@ async def main(strategy: Strategy, setup_index: bool = True):
search_info = loop.run_until_complete(
setup_search_info(
- search_service=args.searchservice,
- index_name=args.index,
+ search_service=os.environ["AZURE_SEARCH_SERVICE"],
+ index_name=os.environ["AZURE_SEARCH_INDEX"],
azure_credential=azd_credential,
search_key=clean_key_if_exists(args.searchkey),
)
)
blob_manager = setup_blob_manager(
azure_credential=azd_credential,
- storage_account=args.storageaccount,
- storage_container=args.container,
- storage_resource_group=args.storageresourcegroup,
- subscription_id=args.subscriptionid,
- search_images=args.searchimages,
+ storage_account=os.environ["AZURE_STORAGE_ACCOUNT"],
+ storage_container=os.environ["AZURE_STORAGE_CONTAINER"],
+ storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"],
+ subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
+ search_images=use_gptvision,
storage_key=clean_key_if_exists(args.storagekey),
)
list_file_strategy = setup_list_file_strategy(
azure_credential=azd_credential,
local_files=args.files,
- datalake_storage_account=args.datalakestorageaccount,
- datalake_filesystem=args.datalakefilesystem,
- datalake_path=args.datalakepath,
+ datalake_storage_account=os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT"),
+ datalake_filesystem=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM"),
+ datalake_path=os.getenv("AZURE_ADLS_GEN2_FILESYSTEM_PATH"),
datalake_key=clean_key_if_exists(args.datalakekey),
)
+
+ openai_host = os.environ["OPENAI_HOST"]
+ openai_key = None
+ if os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"):
+ openai_key = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE")
+ elif not openai_host.startswith("azure") and os.getenv("OPENAI_API_KEY"):
+ openai_key = os.getenv("OPENAI_API_KEY")
+
+ openai_dimensions = 1536
+ if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"):
+ openai_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"])
openai_embeddings_service = setup_embeddings_service(
azure_credential=azd_credential,
- openai_host=args.openaihost,
- openai_model_name=args.openaimodelname,
- openai_service=args.openaiservice,
- openai_custom_url=args.openaicustomurl,
- openai_deployment=args.openaideployment,
- openai_dimensions=args.openaidimensions,
- openai_key=clean_key_if_exists(args.openaikey),
- openai_org=args.openaiorg,
- disable_vectors=args.novectors,
+ openai_host=openai_host,
+ openai_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"],
+ openai_service=os.getenv("AZURE_OPENAI_SERVICE"),
+ openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"),
+ openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"),
+ openai_dimensions=openai_dimensions,
+ openai_key=clean_key_if_exists(openai_key),
+ openai_org=os.getenv("OPENAI_ORGANIZATION"),
+ disable_vectors=dont_use_vectors,
disable_batch_vectors=args.disablebatchvectors,
)
@@ -445,23 +379,25 @@ async def main(strategy: Strategy, setup_index: bool = True):
blob_manager=blob_manager,
document_action=document_action,
embeddings=openai_embeddings_service,
- subscription_id=args.subscriptionid,
+ subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
search_service_user_assigned_id=args.searchserviceassignedid,
- search_analyzer_name=args.searchanalyzername,
- use_acls=args.useacls,
+ search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
+ use_acls=use_acls,
category=args.category,
)
else:
file_processors = setup_file_processors(
azure_credential=azd_credential,
- document_intelligence_service=args.documentintelligenceservice,
+ document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"),
document_intelligence_key=clean_key_if_exists(args.documentintelligencekey),
- local_pdf_parser=args.localpdfparser,
- local_html_parser=args.localhtmlparser,
- search_images=args.searchimages,
+ local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true",
+ local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true",
+ search_images=use_gptvision,
)
image_embeddings_service = setup_image_embeddings_service(
- azure_credential=azd_credential, vision_endpoint=args.visionendpoint, search_images=args.searchimages
+ azure_credential=azd_credential,
+ vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"),
+ search_images=use_gptvision,
)
ingestion_strategy = FileStrategy(
@@ -472,8 +408,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
document_action=document_action,
embeddings=openai_embeddings_service,
image_embeddings=image_embeddings_service,
- search_analyzer_name=args.searchanalyzername,
- use_acls=args.useacls,
+ search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
+ use_acls=use_acls,
category=args.category,
)
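Since prepdocs now pulls its configuration from the azd environment rather than CLI flags, a typical ingestion run reduces to the following sketch. It assumes `azd up` (or `azd env set`) has already populated values such as `AZURE_SEARCH_SERVICE`; set `USE_LOCAL_PDF_PARSER` or `USE_LOCAL_HTML_PARSER` to `true` if you want to avoid the Document Intelligence dependency:

```shell
# Run from the repository root inside the backend's Python environment;
# adjust the glob to wherever your documents live.
python app/backend/prepdocs.py './data/*' -v
```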
diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py
index b9ada05f10..e9f18e795a 100644
--- a/app/backend/prepdocslib/blobmanager.py
+++ b/app/backend/prepdocslib/blobmanager.py
@@ -18,7 +18,7 @@
from .listfilestrategy import File
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
class BlobManager:
diff --git a/app/backend/prepdocslib/embeddings.py b/app/backend/prepdocslib/embeddings.py
index b9c5a9219f..c538952e72 100644
--- a/app/backend/prepdocslib/embeddings.py
+++ b/app/backend/prepdocslib/embeddings.py
@@ -17,7 +17,7 @@
)
from typing_extensions import TypedDict
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
class EmbeddingBatch:
diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py
index e8cab16983..26745e744d 100644
--- a/app/backend/prepdocslib/filestrategy.py
+++ b/app/backend/prepdocslib/filestrategy.py
@@ -8,7 +8,7 @@
from .searchmanager import SearchManager, Section
from .strategy import DocumentAction, SearchInfo, Strategy
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
async def parse_file(
diff --git a/app/backend/prepdocslib/htmlparser.py b/app/backend/prepdocslib/htmlparser.py
index 0acf88b050..a42579f640 100644
--- a/app/backend/prepdocslib/htmlparser.py
+++ b/app/backend/prepdocslib/htmlparser.py
@@ -7,7 +7,7 @@
from .page import Page
from .parser import Parser
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
def cleanup_data(data: str) -> str:
diff --git a/app/backend/prepdocslib/integratedvectorizerstrategy.py b/app/backend/prepdocslib/integratedvectorizerstrategy.py
index 0c475b9f52..58b84a1689 100644
--- a/app/backend/prepdocslib/integratedvectorizerstrategy.py
+++ b/app/backend/prepdocslib/integratedvectorizerstrategy.py
@@ -28,7 +28,7 @@
from .searchmanager import SearchManager
from .strategy import DocumentAction, SearchInfo, Strategy
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
class IntegratedVectorizerStrategy(Strategy):
diff --git a/app/backend/prepdocslib/listfilestrategy.py b/app/backend/prepdocslib/listfilestrategy.py
index bd6a48d651..3c8fcd27b0 100644
--- a/app/backend/prepdocslib/listfilestrategy.py
+++ b/app/backend/prepdocslib/listfilestrategy.py
@@ -13,7 +13,7 @@
DataLakeServiceClient,
)
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
class File:
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
index 33335aadd6..6604110020 100644
--- a/app/backend/prepdocslib/pdfparser.py
+++ b/app/backend/prepdocslib/pdfparser.py
@@ -11,7 +11,7 @@
from .page import Page
from .parser import Parser
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
class LocalPdfParser(Parser):
diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py
index 496e5ca30a..8757926000 100644
--- a/app/backend/prepdocslib/searchmanager.py
+++ b/app/backend/prepdocslib/searchmanager.py
@@ -26,7 +26,7 @@
from .strategy import SearchInfo
from .textsplitter import SplitPage
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
class Section:
diff --git a/app/backend/prepdocslib/textsplitter.py b/app/backend/prepdocslib/textsplitter.py
index 5d899e691e..30b0c1ad77 100644
--- a/app/backend/prepdocslib/textsplitter.py
+++ b/app/backend/prepdocslib/textsplitter.py
@@ -6,7 +6,7 @@
from .page import Page, SplitPage
-logger = logging.getLogger("ingester")
+logger = logging.getLogger("scripts")
class TextSplitter(ABC):
diff --git a/app/backend/requirements.in b/app/backend/requirements.in
index ba7aed8fb0..be5dd02754 100644
--- a/app/backend/requirements.in
+++ b/app/backend/requirements.in
@@ -29,3 +29,4 @@ beautifulsoup4
types-beautifulsoup4
msgraph-sdk==1.1.0
openai-messages-token-helper
+python-dotenv
diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
index 2234f99278..8bcb466edf 100644
--- a/app/backend/requirements.txt
+++ b/app/backend/requirements.txt
@@ -348,6 +348,8 @@ python-dateutil==2.9.0.post0
# microsoft-kiota-serialization-text
# pendulum
# time-machine
+python-dotenv==1.0.1
+ # via -r requirements.in
quart==0.19.6
# via
# -r requirements.in
diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx
index 001a9b8712..1129e0f590 100644
--- a/app/frontend/src/pages/chat/Chat.tsx
+++ b/app/frontend/src/pages/chat/Chat.tsx
@@ -131,7 +131,7 @@ const Chat = () => {
if (event["context"] && event["context"]["data_points"]) {
event["message"] = event["delta"];
askResponse = event as ChatAppResponse;
- } else if (event["delta"]["content"]) {
+ } else if (event["delta"] && event["delta"]["content"]) {
setIsLoading(false);
await updateState(event["delta"]["content"]);
} else if (event["context"]) {
diff --git a/app/start.sh b/app/start.sh
index ec7d64067a..d703654e69 100755
--- a/app/start.sh
+++ b/app/start.sh
@@ -1,21 +1,5 @@
#!/bin/sh
-echo ""
-echo "Loading azd .env file from current environment"
-echo ""
-
-while IFS='=' read -r key value; do
- value=$(echo "$value" | sed 's/^"//' | sed 's/"$//')
- export "$key=$value"
-done <
+python ./scripts/manageacl.py -v --acl-action update_storage_urls --url
```
Going forward, all uploaded documents will have their `storageUrl` set in the search index.
diff --git a/docs/login_and_acl.md b/docs/login_and_acl.md
index 0ec1b549bc..1af26dd249 100644
--- a/docs/login_and_acl.md
+++ b/docs/login_and_acl.md
@@ -20,7 +20,7 @@
This guide demonstrates how to add an optional login and document level access control system to the sample. This system can be used to restrict access to indexed data to specific users based on what [Microsoft Entra groups](https://learn.microsoft.com/entra/fundamentals/how-to-manage-groups) they are a part of, or their [user object id](https://learn.microsoft.com/partner-center/find-ids-and-domain-names#find-the-user-object-id).
-![AppLoginArchitecture](./images/applogincomponents.png)
+![AppLoginArchitecture](/docs/images/applogincomponents.png)
## Requirements
@@ -37,7 +37,7 @@ Two Microsoft Entra applications must be registered in order to make the optiona
The easiest way to setup the two apps is to use the `azd` CLI. We've written scripts that will automatically create the two apps and configure them for use with the sample. To trigger the automatic setup, run the following commands:
1. Run `azd env set AZURE_USE_AUTHENTICATION true` to enable the login UI and use App Service authentication by default.
-1. Ensure access control is enabled on your search index. If your index doesn't exist yet, run prepdocs with `AZURE_USE_AUTHENTICATION` set to `true`. If your index already exists, run `pwsh ./scripts/manageacl.ps1 --acl-action enable_acls`.
+1. Ensure access control is enabled on your search index. If your index doesn't exist yet, run prepdocs with `AZURE_USE_AUTHENTICATION` set to `true`. If your index already exists, run `python ./scripts/manageacl.py --acl-action enable_acls`.
1. (Optional) To require access control when using the app, run `azd env set AZURE_ENFORCE_ACCESS_CONTROL true`. Authentication is always required to search on documents with access control assigned, regardless of if unauthenticated access is enabled or not.
1. (Optional) To allow authenticated users to search on documents that have no access controls assigned, even when access control is required, run `azd env set AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS true`.
1. (Optional) To allow unauthenticated users to use the app, even when access control is enforced, run `azd env set AZURE_ENABLE_UNAUTHENTICATED_ACCESS true`. `AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS` should also be set to true if you want unauthenticated users to be able to search on documents with no access control.
@@ -154,9 +154,9 @@ print(token.token)
- If your primary tenant restricts the ability to create Entra applications, you'll need to use a separate tenant to create the Entra applications. You can create a new tenant by following [these instructions](https://learn.microsoft.com/entra/identity-platform/quickstart-create-new-tenant). Then run `azd env set AZURE_AUTH_TENANT_ID ` before running `azd up`.
- If any Entra apps need to be recreated, you can avoid redeploying the app by [changing the app settings in the portal](https://learn.microsoft.com/azure/app-service/configure-common?tabs=portal#configure-app-settings). Any of the [required environment variables](#environment-variables-reference) can be changed. Once the environment variables have been changed, restart the web app.
-- It's possible a consent dialog will not appear when you log into the app for the first time. If this consent dialog doesn't appear, you will be unable to use the security filters because the API server app does not have permission to read your authorization information. A consent dialog can be forced to appear by adding `"prompt": "consent"` to the `loginRequest` property in [`authentication.py`](../app/backend/core/authentication.py)
+- It's possible a consent dialog will not appear when you log into the app for the first time. If this consent dialog doesn't appear, you will be unable to use the security filters because the API server app does not have permission to read your authorization information. A consent dialog can be forced to appear by adding `"prompt": "consent"` to the `loginRequest` property in [`authentication.py`](/app/backend/core/authentication.py)
- It's possible that your tenant admin has placed a restriction on consent to apps with [unverified publishers](https://learn.microsoft.com/entra/identity-platform/publisher-verification-overview). In this case, only admins may consent to the client and server apps, and normal user accounts are unable to use the login system until the admin consents on behalf of the entire organization.
-- It's possible that your tenant admin requires [admin approval of all new apps](https://learn.microsoft.com/entra/identity/enterprise-apps/manage-consent-requests). Regardless of whether you select the delegated or admin permissions, the app will not work without tenant admin consent.
+- It's possible that your tenant admin requires [admin approval of all new apps](https://learn.microsoft.com/entra/identity/enterprise-apps/manage-consent-requests). Regardless of whether you select the delegated or admin permissions, the app will not work without tenant admin consent. See this guide for [granting consent to an app](https://learn.microsoft.com/entra/identity/enterprise-apps/grant-admin-consent?pivots=portal).
## Adding data with document level access control
@@ -167,65 +167,74 @@ The sample supports 2 main strategies for adding data with document level access
### Using the Add Documents API
-Manually enable document level access control on a search index and manually set access control values using the [manageacl.ps1](../scripts/manageacl.ps1) script.
+Manually enable document level access control on a search index and manually set access control values using the [manageacl.py](/scripts/manageacl.py) script.
-Run `azd up` or use `azd env set` to manually set `AZURE_SEARCH_SERVICE` and `AZURE_SEARCH_INDEX` environment variables prior to running the script.
+Prior to running the script:
-The script supports the following commands. Note that the syntax is the same regardless of whether [manageacl.ps1](../scripts/manageacl.ps1) or [manageacl.sh](../scripts/manageacl.sh) is used. All commands support `-v` for verbose logging.
+- Run `azd up` or use `azd env set` to manually set the `AZURE_SEARCH_SERVICE` and `AZURE_SEARCH_INDEX` azd environment variables
+- Activate the Python virtual environment for your shell session
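For example (a sketch; the service and index names are placeholders, and the activation command depends on where your virtual environment lives):

```shell
azd env set AZURE_SEARCH_SERVICE <your-search-service>
azd env set AZURE_SEARCH_INDEX <your-index-name>
source .venv/bin/activate
```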
-- `./scripts/manageacl.ps1 --acl-action enable_acls`: Creates the required `oids` (User ID) and `groups` (Group IDs) [security filter](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) fields for document level access control on your index, as well as the `storageUrl` field for storing the Blob storage URL. Does nothing if these fields already exist.
+The script supports the following commands. All commands support `-v` for verbose logging.
+
+- `python ./scripts/manageacl.py --acl-action enable_acls`: Creates the required `oids` (User ID) and `groups` (Group IDs) [security filter](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) fields for document level access control on your index, as well as the `storageUrl` field for storing the Blob storage URL. Does nothing if these fields already exist.
Example usage:
```shell
- ./scripts/manageacl.ps1 -v --acl-action enable_acls
+ python ./scripts/manageacl.py -v --acl-action enable_acls
```
-- `./scripts/manageacl.ps1 --acl-type [oids or groups]--acl-action view --url [https://url.pdf]`: Prints access control values associated with either User IDs or Group IDs for the document at the specified URL.
+- `python ./scripts/manageacl.py --acl-type [oids or groups] --acl-action view --url [https://url.pdf]`: Prints access control values associated with either User IDs or Group IDs for the document at the specified URL.
Example to view all Group IDs:
```shell
- ./scripts/manageacl.ps1 -v --acl-type groups --acl-action view --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type groups --acl-action view --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
-- `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups]--acl-action add --acl [ID of group or user]`: Adds an access control value associated with either User IDs or Group IDs for the document at the specified URL.
+- `python ./scripts/manageacl.py --acl-type [oids or groups] --acl-action add --acl [ID of group or user] --url [https://url.pdf]`: Adds an access control value associated with either User IDs or Group IDs for the document at the specified URL.
Example to add a Group ID:
```shell
- ./scripts/manageacl.ps1 -v --acl-type groups --acl-action add --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type groups --acl-action add --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
-- `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups]--acl-action remove_all`: Removes all access control values associated with either User IDs or Group IDs for a specific document.
+- `python ./scripts/manageacl.py --acl-type [oids or groups] --acl-action remove_all --url [https://url.pdf]`: Removes all access control values associated with either User IDs or Group IDs for a specific document.
Example to remove all Group IDs:
```shell
- ./scripts/manageacl.ps1 -v --acl-type groups --acl-action remove_all --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type groups --acl-action remove_all --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
-- `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups]--acl-action remove --acl [ID of group or user]`: Removes an access control value associated with either User IDs or Group IDs for a specific document.
+- `python ./scripts/manageacl.py --url [https://url.pdf] --acl-type [oids or groups] --acl-action remove --acl [ID of group or user]`: Removes an access control value associated with either User IDs or Group IDs for a specific document.
Example to remove a specific User ID:
```shell
- ./scripts/manageacl.ps1 -v --acl-type oids --acl-action remove --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type oids --acl-action remove --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
### Azure Data Lake Storage Gen2 Setup
-[Azure Data Lake Storage Gen2](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) implements an [access control model](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control) that can be used for document level access control. The [adlsgen2setup.ps1](../scripts/adlsgen2setup.ps1) script uploads the sample data included in the [data](./data) folder to a Data Lake Storage Gen2 storage account. The [Storage Blob Data Owner](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control-model#role-based-access-control-azure-rbac) role is required to use the script.
+[Azure Data Lake Storage Gen2](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) implements an [access control model](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control) that can be used for document level access control. The [adlsgen2setup.py](/scripts/adlsgen2setup.py) script uploads the sample data included in the [data](./data) folder to a Data Lake Storage Gen2 storage account. The [Storage Blob Data Owner](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control-model#role-based-access-control-azure-rbac) role is required to use the script.
In order to use this script, an existing Data Lake Storage Gen2 storage account is required. Run `azd env set AZURE_ADLS_GEN2_STORAGE_ACCOUNT ` prior to running the script.
-To run the script, run the following command: `/scripts/adlsgen2setup.ps1`. The script performs the following steps:
+Then run the script inside your Python environment:
+
+```shell
+python /scripts/adlsgen2setup.py './data/*' --data-access-control './scripts/sampleacls.json' -v
+```
+
+The script performs the following steps:
-- Creates example [groups](https://learn.microsoft.com/entra/fundamentals/how-to-manage-groups) listed in the [sampleacls.json](../scripts/sampleacls.json) file.
+- Creates example [groups](https://learn.microsoft.com/entra/fundamentals/how-to-manage-groups) listed in the [sampleacls.json](/scripts/sampleacls.json) file.
- Creates a filesystem / container `gptkbcontainer` in the storage account.
-- Creates the directories listed in the [sampleacls.json](../scripts/sampleacls.json) file.
-- Uploads the sample PDFs referenced in the [sampleacls.json](../scripts/sampleacls.json) file into the appropriate directories.
-- [Recursively sets Access Control Lists (ACLs)](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-acl-cli) using the information from the [sampleacls.json](../scripts/sampleacls.json) file.
+- Creates the directories listed in the [sampleacls.json](/scripts/sampleacls.json) file.
+- Uploads the sample PDFs referenced in the [sampleacls.json](/scripts/sampleacls.json) file into the appropriate directories.
+- [Recursively sets Access Control Lists (ACLs)](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-acl-cli) using the information from the [sampleacls.json](/scripts/sampleacls.json) file.
In order to use the sample access control, you need to join these groups in your Microsoft Entra tenant.
@@ -233,7 +242,7 @@ Note that this optional script may not work in Codespaces if your administrator
#### Azure Data Lake Storage Gen2 Prep Docs
-Once a Data Lake Storage Gen2 storage account has been setup with sample data and access control lists, [prepdocs.py](../app/backend/prepdocs.py) can be used to automatically process PDFs in the storage account and store them with their [access control lists in the search index](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control).
+Once a Data Lake Storage Gen2 storage account has been setup with sample data and access control lists, [prepdocs.py](/app/backend/prepdocs.py) can be used to automatically process PDFs in the storage account and store them with their [access control lists in the search index](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control).
To run this script with a Data Lake Storage Gen2 account, first set the following environment variables:
diff --git a/infra/main.parameters.json b/infra/main.parameters.json
index 3575cd8d5b..a807b9abde 100644
--- a/infra/main.parameters.json
+++ b/infra/main.parameters.json
@@ -116,6 +116,9 @@
"embeddingDimensions": {
"value": "${AZURE_OPENAI_EMB_DIMENSIONS}"
},
+ "gpt4vDeploymentCapacity": {
+ "value": "${AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY=10}"
+ },
"openAiHost": {
"value": "${OPENAI_HOST=azure}"
},
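The `${VAR=default}` syntax means the GPT-4V deployment capacity falls back to 10 unless the azd environment variable is set. To override it (a sketch; pick a value within your Azure OpenAI quota):

```shell
azd env set AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY 30
```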
diff --git a/samples/document-security/README.md b/samples/document-security/README.md
index c4c4a62b70..5f6bde0edd 100644
--- a/samples/document-security/README.md
+++ b/samples/document-security/README.md
@@ -23,21 +23,22 @@ The [azure-search-openai-demo](/) project can set up a full RAG chat app on Azur
## Table of Contents
- [Requirements](#requirements)
-- [Setting up Microsoft Entra ID Apps](#setting-up-entra-id-apps)
+- [Setting up Microsoft Entra applications](#setting-up-microsoft-entra-applications)
- [Automatic Setup](#automatic-setup)
- [Manual Setup](#manual-setup)
- - [Server App](#setting-up-the-server-app)
+ - [Server App](#server-app)
- [Client App](#client-app)
- [Configure Server App Known Client Applications](#configure-server-app-known-client-applications)
- [Testing](#testing)
- - [Troubleshooting Entra ID Setup](#troubleshooting-entra-id-setup)
+ - [Programmatic Access With Authentication](#programmatic-access-with-authentication)
+ - [Troubleshooting](#troubleshooting)
- [Adding data with document level access control](#adding-data-with-document-level-access-control)
- [Using the Add Documents API](#using-the-add-documents-api)
- [Azure Data Lake Storage Gen2 and prepdocs](#azure-data-lake-storage-gen2-setup)
-- [Environment Variables Reference](#environment-variables-reference)
- - [Authentication Behavior by Environment](#authentication-behavior-by-environment)
+- [Environment variables reference](#environment-variables-reference)
+ - [Authentication behavior by environment](#authentication-behavior-by-environment)
-This guide demonstrates how to add an optional login and document level access control system to the sample. This system can be used to restrict access to indexed data to specific users based on what [Microsoft Entra ID groups](https://learn.microsoft.com/azure/active-directory/fundamentals/how-to-manage-groups) they are a part of, or their [user object id](https://learn.microsoft.com/partner-center/find-ids-and-domain-names#find-the-user-object-id).
+This guide demonstrates how to add an optional login and document level access control system to the sample. This system can be used to restrict access to indexed data to specific users based on what [Microsoft Entra groups](https://learn.microsoft.com/entra/fundamentals/how-to-manage-groups) they are a part of, or their [user object id](https://learn.microsoft.com/partner-center/find-ids-and-domain-names#find-the-user-object-id).
![AppLoginArchitecture](/docs/images/applogincomponents.png)
@@ -45,32 +46,33 @@ This guide demonstrates how to add an optional login and document level access c
**IMPORTANT:** In order to add optional login and document level access control, you'll need the following in addition to the normal sample requirements
-- **Azure account permissions**: Your Azure account must have [permission to manage applications in Entra ID](https://learn.microsoft.com/azure/active-directory/roles/permissions-reference#cloud-application-administrator).
+- **Azure account permissions**: Your Azure account must have [permission to manage applications in Microsoft Entra](https://learn.microsoft.com/entra/identity/role-based-access-control/permissions-reference#cloud-application-administrator).
-## Setting up Entra ID Apps
+## Setting up Microsoft Entra applications
-Two Entra ID apps must be registered in order to make the optional login and document level access control system work correctly. One app is for the client UI. The client UI is implemented as a [single page application](https://learn.microsoft.com/azure/active-directory/develop/scenario-spa-app-registration). The other app is for the API server. The API server uses a [confidential client](https://learn.microsoft.com/azure/active-directory/develop/msal-client-applications) to call the [Microsoft Graph API](https://learn.microsoft.com/graph/use-the-api).
+Two Microsoft Entra applications must be registered in order to make the optional login and document level access control system work correctly. One app is for the client UI. The client UI is implemented as a [single page application](https://learn.microsoft.com/entra/identity-platform/scenario-spa-app-registration). The other app is for the API server. The API server uses a [confidential client](https://learn.microsoft.com/entra/identity-platform/msal-client-applications) to call the [Microsoft Graph API](https://learn.microsoft.com/graph/use-the-api).
### Automatic Setup
The easiest way to setup the two apps is to use the `azd` CLI. We've written scripts that will automatically create the two apps and configure them for use with the sample. To trigger the automatic setup, run the following commands:
-1. Run `azd env set AZURE_USE_AUTHENTICATION true` to enable the login UI and App Service authentication.
-1. Ensure access control is enabled on your search index. If your index doesn't exist yet, run prepdocs with `AZURE_USE_AUTHENTICATION` set to `true`. If your index already exists, run `pwsh ./scripts/manageacl.ps1 --acl-action enable_acls`.
+1. Run `azd env set AZURE_USE_AUTHENTICATION true` to enable the login UI and use App Service authentication by default.
+1. Ensure access control is enabled on your search index. If your index doesn't exist yet, run prepdocs with `AZURE_USE_AUTHENTICATION` set to `true`. If your index already exists, run `python ./scripts/manageacl.py --acl-action enable_acls`.
1. (Optional) To require access control when using the app, run `azd env set AZURE_ENFORCE_ACCESS_CONTROL true`. Authentication is always required to search on documents with access control assigned, regardless of if unauthenticated access is enabled or not.
1. (Optional) To allow authenticated users to search on documents that have no access controls assigned, even when access control is required, run `azd env set AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS true`.
1. (Optional) To allow unauthenticated users to use the app, even when access control is enforced, run `azd env set AZURE_ENABLE_UNAUTHENTICATED_ACCESS true`. `AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS` should also be set to true if you want unauthenticated users to be able to search on documents with no access control.
1. Run `azd env set AZURE_AUTH_TENANT_ID ` to set the tenant ID associated with authentication.
+1. If your auth tenant ID is different from your currently logged in tenant ID, run `azd auth login --tenant-id ` to log in to the authentication tenant simultaneously.
1. Run `azd up` to deploy the app.
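Taken together, a minimal authenticated deployment might look like the following sketch (the tenant ID is a placeholder, and the optional access-control settings depend on your requirements):

```shell
azd env set AZURE_USE_AUTHENTICATION true
azd env set AZURE_ENFORCE_ACCESS_CONTROL true
azd env set AZURE_AUTH_TENANT_ID <your-auth-tenant-id>
azd auth login --tenant-id <your-auth-tenant-id>
azd up
```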
### Manual Setup
The following instructions explain how to setup the two apps using the Azure Portal.
-#### Setting up the Server App
+#### Server App
- Sign in to the [Azure portal](https://portal.azure.com/).
-- Select the Entra ID Service.
+- Select the Microsoft Entra ID service.
- In the left hand menu, select **Application Registrations**.
- Select **New Registration**.
- In the **Name** section, enter a meaningful application name. This name will be displayed to users of the app, for example `Azure Search OpenAI Chat API`.
@@ -78,7 +80,9 @@ The following instructions explain how to setup the two apps using the Azure Por
- Select **Register** to create the application
- In the app's registration screen, find the **Application (client) ID**.
- Run the following `azd` command to save this ID: `azd env set AZURE_SERVER_APP_ID `.
-- Entra ID supports three types of credentials to authenticate an app using the [client credentials](https://learn.microsoft.com/azure/active-directory/develop/v2-oauth2-client-creds-grant-flow): passwords (app secrets), certificates, and federated identity credentials. For a higher level of security, either [certificates](https://learn.microsoft.com/azure/active-directory/develop/howto-create-self-signed-certificate) or federated identity credentials are recommended. This sample currently uses an app secret for ease of provisioning.
+
+- Microsoft Entra supports three types of credentials to authenticate an app using the [client credentials](https://learn.microsoft.com/entra/identity-platform/v2-oauth2-client-creds-grant-flow): passwords (app secrets), certificates, and federated identity credentials. For a higher level of security, either [certificates](https://learn.microsoft.com/entra/identity-platform/howto-create-self-signed-certificate) or federated identity credentials are recommended. This sample currently uses an app secret for ease of provisioning.
+
- Select **Certificates & secrets** in the left hand menu.
- In the **Client secrets** section, select **New client secret**.
- Type a description, for example `Azure Search OpenAI Chat Key`.
@@ -90,7 +94,7 @@ The following instructions explain how to setup the two apps using the Azure Por
- Select **Delegated permissions**.
- Search for and select `User.Read`.
- Select **Add permissions**.
-- Select **Expose an API** in the left hand menu. The server app works by using the [On Behalf Of Flow](https://learn.microsoft.com/azure/active-directory/develop/v2-oauth2-on-behalf-of-flow#protocol-diagram), which requires the server app to expose at least 1 API.
+- Select **Expose an API** in the left hand menu. The server app works by using the [On Behalf Of Flow](https://learn.microsoft.com/entra/identity-platform/v2-oauth2-on-behalf-of-flow#protocol-diagram), which requires the server app to expose at least 1 API.
- The application must define a URI to expose APIs. Select **Add** next to **Application ID URI**.
- By default, the Application ID URI is set to `api://`. Accept the default by selecting **Save**.
- Under **Scopes defined by this API**, select **Add a scope**.
@@ -103,16 +107,16 @@ The following instructions explain how to setup the two apps using the Azure Por
- For **User consent description**, type **Allow the app to access Azure Search OpenAI Chat API on your behalf**.
- Leave **State** set to **Enabled**.
- Select **Add scope** at the bottom to save the scope.
-- (Optional) Enable group claims. Include which Entra ID groups the user is part of as part of the login in the [optional claims](https://learn.microsoft.com/azure/active-directory/develop/optional-claims). The groups are used for [optional security filtering](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) in the search results.
+- (Optional) Enable group claims. Include which Microsoft Entra groups the user is part of as part of the login in the [optional claims](https://learn.microsoft.com/entra/identity-platform/optional-claims). The groups are used for [optional security filtering](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) in the search results.
- In the left hand menu, select **Token configuration**
- Under **Optional claims**, select **Add groups claim**
- Select which [group types](https://learn.microsoft.com/azure/active-directory/hybrid/connect/how-to-connect-fed-group-claims) to include in the claim. Note that a [overage claim](https://learn.microsoft.com/azure/active-directory/develop/access-token-claims-reference#groups-overage-claim) will be emitted if the user is part of too many groups. In this case, the API server will use the [Microsoft Graph](https://learn.microsoft.com/graph/api/user-list-memberof?view=graph-rest-1.0&tabs=http) to list the groups the user is part of instead of relying on the groups in the claim.
+ - Select which [group types](https://learn.microsoft.com/entra/identity/hybrid/connect/how-to-connect-fed-group-claims) to include in the claim. Note that an [overage claim](https://learn.microsoft.com/entra/identity-platform/access-token-claims-reference#groups-overage-claim) will be emitted if the user is part of too many groups. In this case, the API server will use the [Microsoft Graph](https://learn.microsoft.com/graph/api/user-list-memberof?view=graph-rest-1.0&tabs=http) to list the groups the user is part of instead of relying on the groups in the claim.
- Select **Add** to save your changes
#### Client App
- Sign in to the [Azure portal](https://portal.azure.com/).
-- Select the Entra ID Service.
+- Select the Microsoft Entra ID service.
- In the left hand menu, select **Application Registrations**.
- Select **New Registration**.
- In the **Name** section, enter a meaningful application name. This name will be displayed to users of the app, for example `Azure Search OpenAI Chat Web App`.
@@ -127,7 +131,7 @@ The following instructions explain how to setup the two apps using the Azure Por
- In the left hand menu, select **Authentication**.
- Under **Implicit grant and hybrid flows**, select **ID Tokens (used for implicit and hybrid flows)**
- Select **Save**
-- In the left hand menu, select **API permissions**. You will add permission to access the **access_as_user** API on the server app. This permission is required for the [On Behalf Of Flow](https://learn.microsoft.com/azure/active-directory/develop/v2-oauth2-on-behalf-of-flow#protocol-diagram) to work.
+- In the left hand menu, select **API permissions**. You will add permission to access the **access_as_user** API on the server app. This permission is required for the [On Behalf Of Flow](https://learn.microsoft.com/entra/identity-platform/v2-oauth2-on-behalf-of-flow#protocol-diagram) to work.
- Select **Add a permission**, and then **My APIs**.
- In the list of applications, select your server application **Azure Search OpenAI Chat API**
- Ensure **Delegated permissions** is selected.
@@ -136,7 +140,7 @@ The following instructions explain how to setup the two apps using the Azure Por
#### Configure Server App Known Client Applications
-Consent from the user must be obtained for use of the client and server app. The client app can prompt the user for consent through a dialog when they log in. The server app has no ability to show a dialog for consent. Client apps can be [added to the list of known clients](https://learn.microsoft.com/azure/active-directory/develop/v2-oauth2-on-behalf-of-flow#gaining-consent-for-the-middle-tier-application) to access the server app, so a consent dialog is shown for the server app.
+Consent from the user must be obtained for use of the client and server app. The client app can prompt the user for consent through a dialog when they log in. The server app has no ability to show a dialog for consent. Client apps can be [added to the list of known clients](https://learn.microsoft.com/entra/identity-platform/v2-oauth2-on-behalf-of-flow#gaining-consent-for-the-middle-tier-application) to access the server app, so a consent dialog is shown for the server app.
- Navigate to the server app registration
- In the left hand menu, select **Manifest**
@@ -145,18 +149,34 @@ Consent from the user must be obtained for use of the client and server app. The
#### Testing
-If you are running setup for the first time, ensure you have run `azd env set AZURE_ADLS_GEN2_STORAGE_ACCOUNT ` before running `azd up`. If you do not set this environment variable, your index will not be initialized with access control support when `prepdocs` is run for the first time. To manually enable access control in your index, use the [manual setup script](#using-the-add-documents-api).
+If you are running setup for the first time, ensure you have run `azd env set AZURE_ADLS_GEN2_STORAGE_ACCOUNT ` before running `azd up`. If you do not set this environment variable, your index will not be initialized with access control support when `prepdocs` is run for the first time. To manually enable access control in your index, use the [manual setup script](#azure-data-lake-storage-gen2-setup).
-Ensure you run `azd env set AZURE_USE_AUTHENTICATION` to enable the login UI once you have setup the two Entra ID apps before you deploy or run the application. The login UI will not appear unless all [required environment variables](#environment-variables-reference) have been setup.
+Ensure you run `azd env set AZURE_USE_AUTHENTICATION true` to enable the login UI once you have set up the two Microsoft Entra apps before you deploy or run the application. The login UI will not appear unless all [required environment variables](#environment-variables-reference) have been set up.
In both the chat and ask a question modes, under **Developer settings** optional **Use oid security filter** and **Use groups security filter** checkboxes will appear. The oid (User ID) filter maps to the `oids` field in the search index and the groups (Group ID) filter maps to the `groups` field in the search index. If `AZURE_ENFORCE_ACCESS_CONTROL` has been set, then both the **Use oid security filter** and **Use groups security filter** options are always enabled and cannot be disabled.
-### Troubleshooting Entra ID Setup
+#### Programmatic Access with Authentication
+
+If you want to call the chat endpoint without the UI while still using authentication, you must disable [App Service built-in authentication](https://learn.microsoft.com/azure/app-service/overview-authentication-authorization) and use only the app's MSAL-based authentication flow. Ensure the `AZURE_DISABLE_APP_SERVICES_AUTHENTICATION` environment variable is set to true before deploying.
+
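+For example, you can set the variable and redeploy with `azd`:
+
+```shell
+azd env set AZURE_DISABLE_APP_SERVICES_AUTHENTICATION true
+azd up
+```
+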
+Get an access token that can be used for calling the chat API using the following code:
+
+```python
+from azure.identity import DefaultAzureCredential
+import os
+
+token = DefaultAzureCredential().get_token(f"api://{os.environ['AZURE_SERVER_APP_ID']}/access_as_user", tenant_id=os.getenv('AZURE_AUTH_TENANT_ID', os.getenv('AZURE_TENANT_ID')))
-- If any Entra ID apps need to be recreated, you can avoid redeploying the app by [changing the app settings in the portal](https://learn.microsoft.com/azure/app-service/configure-common?tabs=portal#configure-app-settings). Any of the [required environment variables](#environment-variables-reference) can be changed. Once the environment variables have been changed, restart the web app.
-- It's possible a consent dialog will not appear when you log into the app for the first time. If this consent dialog doesn't appear, you will be unable to use the security filters because the API server app does not have permission to read your authorization information. A consent dialog can be forced to appear by adding `"prompt": "consent"` to the `loginRequest` property in [`authentication.py`](../../app/backend/core/authentication.py)
-- It's possible that your tenant admin has placed a restriction on consent to apps with [unverified publishers](https://learn.microsoft.com/azure/active-directory/develop/publisher-verification-overview). In this case, only admins may consent to the client and server apps, and normal user accounts are unable to use the login system until the admin consents on behalf of the entire organization.
-- It's possible that your tenant admin requires [admin approval of all new apps](https://learn.microsoft.com/azure/active-directory/manage-apps/manage-consent-requests). Regardless of whether you select the delegated or admin permissions, the app will not work without tenant admin consent.
+print(token.token)
+```
+
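+The returned token can then be passed as a bearer token when calling the chat API. A minimal sketch (assuming the default `/chat` route, a placeholder App Service hostname, and a request body matching the chat protocol used by the frontend):
+
+```shell
+curl -X POST "https://<your-app-service>.azurewebsites.net/chat" \
+  -H "Authorization: Bearer <access-token>" \
+  -H "Content-Type: application/json" \
+  -d '{"messages": [{"role": "user", "content": "What does a Product Manager do?"}]}'
+```
+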
+### Troubleshooting
+
+- If your primary tenant restricts the ability to create Entra applications, you'll need to use a separate tenant to create the Entra applications. You can create a new tenant by following [these instructions](https://learn.microsoft.com/entra/identity-platform/quickstart-create-new-tenant). Then run `azd env set AZURE_AUTH_TENANT_ID <your-auth-tenant-id>` before running `azd up`.
+- If any Entra apps need to be recreated, you can avoid redeploying the app by [changing the app settings in the portal](https://learn.microsoft.com/azure/app-service/configure-common?tabs=portal#configure-app-settings). Any of the [required environment variables](#environment-variables-reference) can be changed. Once the environment variables have been changed, restart the web app.
+- It's possible a consent dialog will not appear when you log into the app for the first time. If this consent dialog doesn't appear, you will be unable to use the security filters because the API server app does not have permission to read your authorization information. A consent dialog can be forced to appear by adding `"prompt": "consent"` to the `loginRequest` property in [`authentication.py`](/app/backend/core/authentication.py)
+- It's possible that your tenant admin has placed a restriction on consent to apps with [unverified publishers](https://learn.microsoft.com/entra/identity-platform/publisher-verification-overview). In this case, only admins may consent to the client and server apps, and normal user accounts are unable to use the login system until the admin consents on behalf of the entire organization.
+- It's possible that your tenant admin requires [admin approval of all new apps](https://learn.microsoft.com/entra/identity/enterprise-apps/manage-consent-requests). Regardless of whether you select the delegated or admin permissions, the app will not work without tenant admin consent. See this guide for [granting consent to an app](https://learn.microsoft.com/entra/identity/enterprise-apps/grant-admin-consent?pivots=portal).
## Adding data with document level access control
@@ -167,73 +187,82 @@ The sample supports 2 main strategies for adding data with document level access
### Using the Add Documents API
-Manually enable document level access control on a search index and manually set access control values using the [manageacl.ps1](../../scripts/manageacl.ps1) script.
+Manually enable document level access control on a search index and manually set access control values using the [manageacl.py](/scripts/manageacl.py) script.
+
+Prior to running the script:
-Run `azd up` or use `azd env set` to manually set `AZURE_SEARCH_SERVICE` and `AZURE_SEARCH_INDEX` environment variables prior to running the script.
+- Run `azd up` or use `azd env set` to manually set the `AZURE_SEARCH_SERVICE` and `AZURE_SEARCH_INDEX` azd environment variables
+- Activate the Python virtual environment for your shell session
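+
+For example (a minimal sketch, assuming placeholder resource names and the repo's default `.venv` virtual environment on Linux/macOS):
+
+```shell
+azd env set AZURE_SEARCH_SERVICE <your-search-service>
+azd env set AZURE_SEARCH_INDEX <your-search-index>
+source .venv/bin/activate
+```
+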
-The script supports the following commands. Note that the syntax is the same regardless of whether [manageacl.ps1](../../scripts/manageacl.ps1) or [manageacl.sh](../../scripts/manageacl.sh) is used. All commands support `-v` for verbose logging.
+The script supports the following commands. All commands support `-v` for verbose logging.
-- `./scripts/manageacl.ps1 --acl-action enable_acls`: Creates the required `oids` (User ID) and `groups` (Group IDs) [security filter](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) fields for document level access control on your index, as well as the `storageUrl` field for storing the Blob storage URL. Does nothing if these fields already exist.
+- `python ./scripts/manageacl.py --acl-action enable_acls`: Creates the required `oids` (User ID) and `groups` (Group IDs) [security filter](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) fields for document level access control on your index, as well as the `storageUrl` field for storing the Blob storage URL. Does nothing if these fields already exist.
Example usage:
```shell
- ./scripts/manageacl.ps1 -v --acl-action enable_acls
+ python ./scripts/manageacl.py -v --acl-action enable_acls
```
-- `./scripts/manageacl.ps1 --acl-type [oids or groups]--acl-action view --url [https://url.pdf]`: Prints access control values associated with either User IDs or Group IDs for the document at the specified URL.
+- `python ./scripts/manageacl.py --acl-type [oids or groups] --acl-action view --url [https://url.pdf]`: Prints access control values associated with either User IDs or Group IDs for the document at the specified URL.
Example to view all Group IDs:
```shell
- ./scripts/manageacl.ps1 -v --acl-type groups --acl-action view --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type groups --acl-action view --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
-- `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups]--acl-action add --acl [ID of group or user]`: Adds an access control value associated with either User IDs or Group IDs for the document at the specified URL.
+- `python ./scripts/manageacl.py --acl-type [oids or groups] --acl-action add --acl [ID of group or user] --url [https://url.pdf]`: Adds an access control value associated with either User IDs or Group IDs for the document at the specified URL.
Example to add a Group ID:
```shell
- ./scripts/manageacl.ps1 -v --acl-type groups --acl-action add --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type groups --acl-action add --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
-- `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups]--acl-action remove_all`: Removes all access control values associated with either User IDs or Group IDs for a specific document.
+- `python ./scripts/manageacl.py --acl-type [oids or groups] --acl-action remove_all --url [https://url.pdf]`: Removes all access control values associated with either User IDs or Group IDs for a specific document.
Example to remove all Group IDs:
```shell
- ./scripts/manageacl.ps1 -v --acl-type groups --acl-action remove_all --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type groups --acl-action remove_all --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
-- `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups]--acl-action remove --acl [ID of group or user]`: Removes an access control value associated with either User IDs or Group IDs for a specific document.
+- `python ./scripts/manageacl.py --acl-type [oids or groups] --acl-action remove --acl [ID of group or user] --url [https://url.pdf]`: Removes an access control value associated with either User IDs or Group IDs for a specific document.
Example to remove a specific User ID:
```shell
- ./scripts/manageacl.ps1 -v --acl-type oids --acl-action remove --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
+ python ./scripts/manageacl.py -v --acl-type oids --acl-action remove --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
```
### Azure Data Lake Storage Gen2 Setup
-[Azure Data Lake Storage Gen2](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) implements an [access control model](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control) that can be used for document level access control. The [adlsgen2setup.ps1](../../scripts/adlsgen2setup.ps1) script uploads the sample data included in the [data](./data) folder to a Data Lake Storage Gen2 storage account. The [Storage Blob Data Owner](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control-model#role-based-access-control-azure-rbac) role is required to use the script.
+[Azure Data Lake Storage Gen2](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) implements an [access control model](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control) that can be used for document level access control. The [adlsgen2setup.py](/scripts/adlsgen2setup.py) script uploads the sample data included in the [data](./data) folder to a Data Lake Storage Gen2 storage account. The [Storage Blob Data Owner](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control-model#role-based-access-control-azure-rbac) role is required to use the script.
In order to use this script, an existing Data Lake Storage Gen2 storage account is required. Run `azd env set AZURE_ADLS_GEN2_STORAGE_ACCOUNT <your-storage-account>` prior to running the script.
-To run the script, run the following command: `/scripts/adlsgen2setup.ps1`. The script performs the following steps:
+Then run the script inside your Python environment:
+
+```shell
+python ./scripts/adlsgen2setup.py './data/*' --data-access-control './scripts/sampleacls.json' -v
+```
+
+The script performs the following steps:
-- Creates example [groups](https://learn.microsoft.com/azure/active-directory/fundamentals/how-to-manage-groups) listed in the [sampleacls.json](../../scripts/sampleacls.json) file.
+- Creates example [groups](https://learn.microsoft.com/entra/fundamentals/how-to-manage-groups) listed in the [sampleacls.json](/scripts/sampleacls.json) file.
- Creates a filesystem / container `gptkbcontainer` in the storage account.
-- Creates the directories listed in the [sampleacls.json](../../scripts/sampleacls.json) file.
-- Uploads the sample PDFs referenced in the [sampleacls.json](../../scripts/sampleacls.json) file into the appropriate directories.
-- [Recursively sets Access Control Lists (ACLs)](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-acl-cli) using the information from the [sampleacls.json](../../scripts/sampleacls.json) file.
+- Creates the directories listed in the [sampleacls.json](/scripts/sampleacls.json) file.
+- Uploads the sample PDFs referenced in the [sampleacls.json](/scripts/sampleacls.json) file into the appropriate directories.
+- [Recursively sets Access Control Lists (ACLs)](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-acl-cli) using the information from the [sampleacls.json](/scripts/sampleacls.json) file.
-In order to use the sample access control, you need to join these groups in your Entra ID tenant.
+In order to use the sample access control, you need to join these groups in your Microsoft Entra tenant.
-Note that this optional script may not work in Codespaces if your administrator has applied a [Conditional Access policy](https://learn.microsoft.com/azure/active-directory/conditional-access/overview) to your tenant.
+Note that this optional script may not work in Codespaces if your administrator has applied a [Conditional Access policy](https://learn.microsoft.com/entra/identity/conditional-access/overview) to your tenant.
#### Azure Data Lake Storage Gen2 Prep Docs
-Once a Data Lake Storage Gen2 storage account has been setup with sample data and access control lists, [prepdocs.py](../../app/backend/prepdocs.py) can be used to automatically process PDFs in the storage account and store them with their [access control lists in the search index](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control).
+Once a Data Lake Storage Gen2 storage account has been setup with sample data and access control lists, [prepdocs.py](/app/backend/prepdocs.py) can be used to automatically process PDFs in the storage account and store them with their [access control lists in the search index](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control).
To run this script with a Data Lake Storage Gen2 account, first set the following environment variables:
@@ -243,18 +272,19 @@ To run this script with a Data Lake Storage Gen2 account, first set the followin
Once the environment variables are set, run the script using the following command: `/scripts/prepdocs.ps1` or `/scripts/prepdocs.sh`.
-## Environment Variables Reference
+## Environment variables reference
The following environment variables are used to setup the optional login and document level access control:
-- `AZURE_USE_AUTHENTICATION`: Enables Entra ID based optional login and document level access control. Set to true before running `azd up`.
+- `AZURE_USE_AUTHENTICATION`: Enables Entra ID login and document level access control. Set to true before running `azd up`.
- `AZURE_ENFORCE_ACCESS_CONTROL`: Enforces Entra ID based login and document level access control on documents with access control assigned. Set to true before running `azd up`. If `AZURE_ENFORCE_ACCESS_CONTROL` is enabled and `AZURE_ENABLE_UNAUTHENTICATED_ACCESS` is not enabled, then authentication is required to use the app.
- `AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS`: Allows users to search on documents that have no access controls assigned
- `AZURE_ENABLE_UNAUTHENTICATED_ACCESS`: Allows unauthenticated users to access the chat app, even when `AZURE_ENFORCE_ACCESS_CONTROL` is enabled. `AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS` should be set to true to allow unauthenticated users to search on documents that have no access control assigned. Unauthenticated users cannot search on documents with access control assigned.
-- `AZURE_SERVER_APP_ID`: (Required) Application ID of the Entra ID app for the API server.
-- `AZURE_SERVER_APP_SECRET`: [Client secret](https://learn.microsoft.com/azure/active-directory/develop/v2-oauth2-client-creds-grant-flow) used by the API server to authenticate using the Entra ID API server app.
-- `AZURE_CLIENT_APP_ID`: Application ID of the Entra ID app for the client UI.
-- `AZURE_AUTH_TENANT_ID`: [Tenant ID](https://learn.microsoft.com/azure/active-directory/fundamentals/how-to-find-tenant) associated with the Entra ID used for login and document level access control. Defaults to `AZURE_TENANT_ID` if not defined.
+- `AZURE_DISABLE_APP_SERVICES_AUTHENTICATION`: Disables [use of built-in authentication for App Services](https://learn.microsoft.com/azure/app-service/overview-authentication-authorization). An authentication flow based on the MSAL SDKs is used instead. Useful when you want to provide programmatic access to the chat endpoints with authentication.
+- `AZURE_SERVER_APP_ID`: (Required) Application ID of the Microsoft Entra app for the API server.
+- `AZURE_SERVER_APP_SECRET`: [Client secret](https://learn.microsoft.com/entra/identity-platform/v2-oauth2-client-creds-grant-flow) used by the API server to authenticate using the Microsoft Entra server app.
+- `AZURE_CLIENT_APP_ID`: Application ID of the Microsoft Entra app for the client UI.
+- `AZURE_AUTH_TENANT_ID`: [Tenant ID](https://learn.microsoft.com/entra/fundamentals/how-to-find-tenant) associated with the Microsoft Entra tenant used for login and document level access control. Defaults to `AZURE_TENANT_ID` if not defined.
- `AZURE_ADLS_GEN2_STORAGE_ACCOUNT`: (Optional) Name of existing [Data Lake Storage Gen2 storage account](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [setup](#azure-data-lake-storage-gen2-setup) and [prep docs](#azure-data-lake-storage-gen2-prep-docs) scripts.
- `AZURE_ADLS_GEN2_FILESYSTEM`: (Optional) Name of existing [Data Lake Storage Gen2 filesystem](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [setup](#azure-data-lake-storage-gen2-setup) and [prep docs](#azure-data-lake-storage-gen2-prep-docs) scripts.
- `AZURE_ADLS_GEN2_FILESYSTEM_PATH`: (Optional) Name of existing path in a [Data Lake Storage Gen2 filesystem](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [prep docs](#azure-data-lake-storage-gen2-prep-docs) script.
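+
+For example, a typical sequence for enabling optional login with document level access control might look like this (placeholder values; see the variable descriptions above):
+
+```shell
+azd env set AZURE_USE_AUTHENTICATION true
+azd env set AZURE_ENFORCE_ACCESS_CONTROL true
+azd env set AZURE_AUTH_TENANT_ID <your-auth-tenant-id>
+azd env set AZURE_ADLS_GEN2_STORAGE_ACCOUNT <your-storage-account>
+azd up
+```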
diff --git a/scripts/adlsgen2setup.ps1 b/scripts/adlsgen2setup.ps1
deleted file mode 100644
index e6b80c0d46..0000000000
--- a/scripts/adlsgen2setup.ps1
+++ /dev/null
@@ -1,19 +0,0 @@
-## Set the preference to stop on the first error
-$ErrorActionPreference = "Stop"
-
-& $PSScriptRoot\loadenv.ps1
-
-$venvPythonPath = "./.venv/scripts/python.exe"
-if (Test-Path -Path "/usr") {
- # fallback to Linux venv path
- $venvPythonPath = "./.venv/bin/python"
-}
-
-if ([string]::IsNullOrEmpty($env:AZURE_ADLS_GEN2_STORAGE_ACCOUNT)) {
- Write-Error "AZURE_ADLS_GEN2_STORAGE_ACCOUNT must be set in order to continue"
- exit 1
-}
-
-Write-Host 'Running "adlsgen2setup.py"'
-$cwd = (Get-Location)
-Start-Process -FilePath $venvPythonPath -ArgumentList "./scripts/adlsgen2setup.py `"$cwd/data`" --data-access-control ./scripts/sampleacls.json --storage-account $env:AZURE_ADLS_GEN2_STORAGE_ACCOUNT -v" -Wait -NoNewWindow
diff --git a/scripts/adlsgen2setup.py b/scripts/adlsgen2setup.py
index 98fd265c6f..1deccdf199 100644
--- a/scripts/adlsgen2setup.py
+++ b/scripts/adlsgen2setup.py
@@ -13,6 +13,10 @@
DataLakeServiceClient,
)
+from load_azd_env import load_azd_env
+
+logger = logging.getLogger("scripts")
+
class AdlsGen2Setup:
"""
@@ -54,18 +58,18 @@ def __init__(
async def run(self):
async with self.create_service_client() as service_client:
- logging.info(f"Ensuring {self.filesystem_name} exists...")
+ logger.info(f"Ensuring {self.filesystem_name} exists...")
async with service_client.get_file_system_client(self.filesystem_name) as filesystem_client:
if not await filesystem_client.exists():
await filesystem_client.create_file_system()
- logging.info("Creating groups...")
+ logger.info("Creating groups...")
groups: dict[str, str] = {}
for group in self.data_access_control_format["groups"]:
group_id = await self.create_or_get_group(group)
groups[group] = group_id
- logging.info("Ensuring directories exist...")
+ logger.info("Ensuring directories exist...")
directories: dict[str, DataLakeDirectoryClient] = {}
try:
for directory in self.data_access_control_format["directories"].keys():
@@ -76,23 +80,23 @@ async def run(self):
)
directories[directory] = directory_client
- logging.info("Uploading files...")
+ logger.info("Uploading files...")
for file, file_info in self.data_access_control_format["files"].items():
directory = file_info["directory"]
if directory not in directories:
- logging.error(f"File {file} has unknown directory {directory}, exiting...")
+ logger.error(f"File {file} has unknown directory {directory}, exiting...")
return
await self.upload_file(
directory_client=directories[directory], file_path=os.path.join(self.data_directory, file)
)
- logging.info("Setting access control...")
+ logger.info("Setting access control...")
for directory, access_control in self.data_access_control_format["directories"].items():
directory_client = directories[directory]
if "groups" in access_control:
for group_name in access_control["groups"]:
if group_name not in groups:
- logging.error(
+ logger.error(
f"Directory {directory} has unknown group {group_name} in access control list, exiting"
)
return
@@ -122,7 +126,7 @@ async def create_or_get_group(self, group_name: str):
token_result = await self.credentials.get_token("https://graph.microsoft.com/.default")
self.graph_headers = {"Authorization": f"Bearer {token_result.token}"}
async with aiohttp.ClientSession(headers=self.graph_headers) as session:
- logging.info(f"Searching for group {group_name}...")
+ logger.info(f"Searching for group {group_name}...")
async with session.get(
f"https://graph.microsoft.com/v1.0/groups?$select=id&$top=1&$filter=displayName eq '{group_name}'"
) as response:
@@ -132,7 +136,7 @@ async def create_or_get_group(self, group_name: str):
if len(content["value"]) == 1:
group_id = content["value"][0]["id"]
if not group_id:
- logging.info(f"Could not find group {group_name}, creating...")
+ logger.info(f"Could not find group {group_name}, creating...")
group = {
"displayName": group_name,
"securityEnabled": self.security_enabled_groups,
@@ -146,17 +150,22 @@ async def create_or_get_group(self, group_name: str):
if response.status != 201:
raise Exception(content)
group_id = content["id"]
- logging.info(f"Group {group_name} ID {group_id}")
+ logger.info(f"Group {group_name} ID {group_id}")
return group_id
async def main(args: Any):
+ load_azd_env()
+
+ if not os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT"):
+ raise Exception("AZURE_ADLS_GEN2_STORAGE_ACCOUNT must be set to continue")
+
async with AzureDeveloperCliCredential() as credentials:
with open(args.data_access_control) as f:
data_access_control_format = json.load(f)
command = AdlsGen2Setup(
data_directory=args.data_directory,
- storage_account_name=args.storage_account,
+ storage_account_name=os.environ["AZURE_ADLS_GEN2_STORAGE_ACCOUNT"],
filesystem_name="gptkbcontainer",
security_enabled_groups=args.create_security_enabled_groups,
credentials=credentials,
@@ -168,14 +177,9 @@ async def main(args: Any):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Upload sample data to a Data Lake Storage Gen2 account and associate sample access control lists with it using sample groups",
- epilog="Example: ./scripts/adlsgen2setup.py ./data --data-access-control ./scripts/sampleacls.json --storage-account --create-security-enabled-groups ",
+ epilog="Example: ./scripts/adlsgen2setup.py ./data --data-access-control ./scripts/sampleacls.json --create-security-enabled-groups ",
)
parser.add_argument("data_directory", help="Data directory that contains sample PDFs")
- parser.add_argument(
- "--storage-account",
- required=True,
- help="Name of the Data Lake Storage Gen2 account to upload the sample data to",
- )
parser.add_argument(
"--create-security-enabled-groups",
required=False,
diff --git a/scripts/adlsgen2setup.sh b/scripts/adlsgen2setup.sh
deleted file mode 100755
index 119379212c..0000000000
--- a/scripts/adlsgen2setup.sh
+++ /dev/null
@@ -1,12 +0,0 @@
- #!/bin/sh
-
-. ./scripts/loadenv.sh
-
-if [ -z "$AZURE_ADLS_GEN2_STORAGE_ACCOUNT" ]; then
- echo 'AZURE_ADLS_GEN2_STORAGE_ACCOUNT must be set to continue'
- exit 1
-fi
-
-echo 'Running "adlsgen2setup.py"'
-
-./.venv/bin/python ./scripts/adlsgen2setup.py './data/*' --data-access-control './scripts/sampleacls.json' --storage-account "$AZURE_ADLS_GEN2_STORAGE_ACCOUNT" -v
diff --git a/scripts/auth_init.ps1 b/scripts/auth_init.ps1
index 872cf5ab6e..c16cb5201b 100755
--- a/scripts/auth_init.ps1
+++ b/scripts/auth_init.ps1
@@ -1,6 +1,8 @@
-. ./scripts/load_azd_env.ps1
+Write-Host "Checking if authentication should be setup..."
-if (-not $env:AZURE_USE_AUTHENTICATION) {
+$AZURE_USE_AUTHENTICATION = (azd env get-value AZURE_USE_AUTHENTICATION)
+if ($AZURE_USE_AUTHENTICATION -ne "true") {
+ Write-Host "AZURE_USE_AUTHENTICATION is not set, skipping authentication setup."
Exit 0
}
diff --git a/scripts/auth_init.py b/scripts/auth_init.py
index e638f40f73..f024b70751 100644
--- a/scripts/auth_init.py
+++ b/scripts/auth_init.py
@@ -22,6 +22,7 @@
from msgraph.generated.models.web_application import WebApplication
from auth_common import get_application, test_authentication_enabled
+from load_azd_env import load_azd_env
async def create_application(graph_client: GraphServiceClient, request_app: Application) -> Tuple[str, str]:
@@ -165,11 +166,18 @@ def server_app_known_client_application(client_app_id: str) -> Application:
async def main():
+ load_azd_env()
+
if not test_authentication_enabled():
print("Not setting up authentication.")
exit(0)
- auth_tenant = os.getenv("AZURE_AUTH_TENANT_ID", os.environ["AZURE_TENANT_ID"])
+ auth_tenant = os.getenv("AZURE_AUTH_TENANT_ID", os.getenv("AZURE_TENANT_ID"))
+ if not auth_tenant:
+ print(
+ "Error: No tenant ID set for authentication. Run `azd env set AZURE_AUTH_TENANT_ID tenant-id` to set the tenant ID."
+ )
+ exit(1)
print("Setting up authentication for tenant", auth_tenant)
credential = AzureDeveloperCliCredential(tenant_id=auth_tenant)
diff --git a/scripts/auth_init.sh b/scripts/auth_init.sh
index bd7cfff552..dfe0efe620 100755
--- a/scripts/auth_init.sh
+++ b/scripts/auth_init.sh
@@ -2,9 +2,8 @@
echo "Checking if authentication should be setup..."
-. ./scripts/load_azd_env.sh
-
-if [ -z "$AZURE_USE_AUTHENTICATION" ]; then
+AZURE_USE_AUTHENTICATION=$(azd env get-value AZURE_USE_AUTHENTICATION)
+if [ "$AZURE_USE_AUTHENTICATION" != "true" ]; then
echo "AZURE_USE_AUTHENTICATION is not set, skipping authentication setup."
exit 0
fi
diff --git a/scripts/auth_update.ps1 b/scripts/auth_update.ps1
index 1dbf7efee2..37f2392acc 100644
--- a/scripts/auth_update.ps1
+++ b/scripts/auth_update.ps1
@@ -1,6 +1,5 @@
-. ./scripts/load_azd_env.ps1
-
-if (-not $env:AZURE_USE_AUTHENTICATION) {
+$AZURE_USE_AUTHENTICATION = (azd env get-value AZURE_USE_AUTHENTICATION)
+if ($AZURE_USE_AUTHENTICATION -ne "true") {
Exit 0
}
diff --git a/scripts/auth_update.sh b/scripts/auth_update.sh
index 7b64995f75..31635a237d 100755
--- a/scripts/auth_update.sh
+++ b/scripts/auth_update.sh
@@ -1,8 +1,7 @@
#!/bin/sh
-. ./scripts/load_azd_env.sh
-
-if [ -z "$AZURE_USE_AUTHENTICATION" ]; then
+AZURE_USE_AUTHENTICATION=$(azd env get-value AZURE_USE_AUTHENTICATION)
+if [ "$AZURE_USE_AUTHENTICATION" != "true" ]; then
exit 0
fi
diff --git a/scripts/load_azd_env.ps1 b/scripts/load_azd_env.ps1
deleted file mode 100644
index 9f59bc7b07..0000000000
--- a/scripts/load_azd_env.ps1
+++ /dev/null
@@ -1,8 +0,0 @@
-Write-Host "Loading azd .env file from current environment"
-foreach ($line in (& azd env get-values)) {
- if ($line -match "([^=]+)=(.*)") {
- $key = $matches[1]
- $value = $matches[2] -replace '^"|"$'
- [Environment]::SetEnvironmentVariable($key, $value)
- }
-}
diff --git a/scripts/load_azd_env.py b/scripts/load_azd_env.py
new file mode 100644
index 0000000000..5a6334ab6f
--- /dev/null
+++ b/scripts/load_azd_env.py
@@ -0,0 +1,23 @@
+import json
+import logging
+import subprocess
+
+from dotenv import load_dotenv
+
+logger = logging.getLogger("scripts")
+
+
+def load_azd_env():
+ """Get path to current azd env file and load file using python-dotenv"""
+ result = subprocess.run("azd env list -o json", shell=True, capture_output=True, text=True)
+ if result.returncode != 0:
+ raise Exception("Error loading azd env")
+ env_json = json.loads(result.stdout)
+ env_file_path = None
+ for entry in env_json:
+ if entry["IsDefault"]:
+ env_file_path = entry["DotEnvPath"]
+ if not env_file_path:
+ raise Exception("No default azd env file found")
+ logger.info(f"Loading azd env from {env_file_path}")
+ load_dotenv(env_file_path, override=True)
diff --git a/scripts/load_azd_env.sh b/scripts/load_azd_env.sh
deleted file mode 100755
index 02926243a0..0000000000
--- a/scripts/load_azd_env.sh
+++ /dev/null
@@ -1,10 +0,0 @@
- #!/bin/sh
-
-echo "Loading azd .env file from current environment..."
-
-while IFS='=' read -r key value; do
- value=$(echo "$value" | sed 's/^"//' | sed 's/"$//')
- export "$key=$value"
-done <