diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py index c97ca41939..14675266f7 100644 --- a/app/backend/prepdocslib/mediadescriber.py +++ b/app/backend/prepdocslib/mediadescriber.py @@ -105,4 +105,4 @@ async def describe_image(self, image_bytes) -> str: results = await self.poll_api(session, poll_url, headers) fields = results["result"]["contents"][0]["fields"] - return fields["DescriptionHTML"]["valueString"] + return fields["Description"]["valueString"] diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index e0e809d384..7678a55ccb 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -138,17 +138,19 @@ class ObjectType(Enum): added_objects = set() # set of object types todo mypy for idx, mask_char in enumerate(mask_chars): object_type, object_idx = mask_char - if object_idx is None: - raise ValueError("object_idx should not be None") if object_type == ObjectType.NONE: page_text += form_recognizer_results.content[page_offset + idx] elif object_type == ObjectType.TABLE: + if object_idx is None: + raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) added_objects.add(mask_char) elif object_type == ObjectType.FIGURE: if cu_describer is None: raise ValueError("cu_describer should not be None, unable to describe figure") + if object_idx is None: + raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: figure_html = await DocumentAnalysisParser.figure_to_html( doc_for_pymupdf, cu_describer, figures_on_page[object_idx] @@ -176,7 +178,7 @@ async def figure_to_html( doc: pymupdf.Document, cu_describer: ContentUnderstandingDescriber, figure: DocumentFigure ) -> str: figure_title = (figure.caption and figure.caption.content) or "" - logger.info("Describing figure '%s' with title", figure.id, figure_title) + logger.info("Describing figure %s with title '%s'", figure.id, figure_title) if not figure.bounding_regions: return f"
{figure_title}
" for region in figure.bounding_regions: diff --git a/infra/main.bicep b/infra/main.bicep index 1e467f152a..5c181cd525 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -412,6 +412,7 @@ var appEnvVariables = { USE_LOCAL_PDF_PARSER: useLocalPdfParser USE_LOCAL_HTML_PARSER: useLocalHtmlParser USE_MEDIA_DESCRIBER_AZURE_CU: useMediaDescriberAzureCU + AZURE_CONTENTUNDERSTANDING_ENDPOINT: useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : '' RUNNING_IN_PRODUCTION: 'true' } @@ -1193,6 +1194,7 @@ output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.re output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : '' output AZURE_VISION_ENDPOINT string = useGPT4V ? computerVision.outputs.endpoint : '' +output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : '' output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name output AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP string = documentIntelligenceResourceGroup.name diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh index 68e9a38cf4..c0254755e0 100755 --- a/scripts/prepdocs.sh +++ b/scripts/prepdocs.sh @@ -9,4 +9,4 @@ if [ $# -gt 0 ]; then additionalArgs="$@" fi -./.venv/bin/python ./app/backend/prepdocs.py './data/GPT4V_Examples/Financial Market Analysis Report 2023.pdf' --verbose $additionalArgs +./.venv/bin/python ./app/backend/prepdocs.py './data/*' --verbose $additionalArgs diff --git a/tests/conftest.py b/tests/conftest.py index cfc5326f31..6b4ad7c50c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,6 +34,7 @@ MockBlobClient, MockResponse, mock_computervision_response, + mock_contentunderstanding_response, mock_speak_text_cancelled, mock_speak_text_failed, mock_speak_text_success, @@ -54,10 +55,12 @@ async def mock_search(self, *args, **kwargs): @pytest.fixture -def mock_compute_embeddings_call(monkeypatch): +def mock_azurehttp_calls(monkeypatch): def mock_post(*args, **kwargs): if kwargs.get("url").endswith("computervision/retrieval:vectorizeText"): return mock_computervision_response() + elif kwargs.get("url").endswith("/contentunderstanding/analyzers/image_analyzer:analyze"): + return mock_contentunderstanding_response() else: raise Exception("Unexpected URL for mock call to ClientSession.post()") @@ -327,7 +330,7 @@ async def client( mock_openai_embedding, mock_acs_search, mock_blob_container_client, - mock_compute_embeddings_call, + mock_azurehttp_calls, ): quart_app = app.create_app() @@ -346,7 +349,7 @@ async def client_with_expiring_token( mock_openai_embedding, mock_acs_search, mock_blob_container_client, - mock_compute_embeddings_call, + mock_azurehttp_calls, ): quart_app = app.create_app() diff --git a/tests/mocks.py b/tests/mocks.py index 13dc82ac6e..f8de506d8f 100644 --- a/tests/mocks.py +++ b/tests/mocks.py @@ -203,6 +203,15 @@ def mock_computervision_response(): ) +def mock_contentunderstanding_response(): + return MockResponse( + status=200, + headers={ + "Operation-Location": "https://cu-ztmfrxlgtk3nq.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview" + }, + ) + + class MockAudio: def __init__(self, audio_data): self.audio_data = audio_data