From e34e035796398e1287a1afe5dc71be58dbb812f7 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Thu, 30 May 2024 14:50:50 +0200 Subject: [PATCH] remove old notebooks, update intro texts Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .../qa_doc_collection/doc_collection_qa.ipynb | 998 ---------- .../README.md | 7 +- .../qa_deep_dive.ipynb | 56 +- .../qa_quick_start.ipynb | 8 +- examples/qa_single_doc/README.md | 22 - examples/qa_single_doc/single_doc_qa.ipynb | 1675 ----------------- 6 files changed, 36 insertions(+), 2730 deletions(-) delete mode 100644 examples/qa_doc_collection/doc_collection_qa.ipynb rename examples/{qa_doc_collection => qa_rag_semantic_retrieval}/README.md (68%) delete mode 100644 examples/qa_single_doc/README.md delete mode 100644 examples/qa_single_doc/single_doc_qa.ipynb diff --git a/examples/qa_doc_collection/doc_collection_qa.ipynb b/examples/qa_doc_collection/doc_collection_qa.ipynb deleted file mode 100644 index c1150b6..0000000 --- a/examples/qa_doc_collection/doc_collection_qa.ipynb +++ /dev/null @@ -1,998 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "99f717ef-4cba-4300-b258-0b1c248cb873", - "metadata": {}, - "source": [ - "# RAG and Semantic Retrieval on a Document Collection\n", - "\n", - "Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answer your questions using the information in the corpus.\n", - "\n", - "In this example we demonstrate how achive the same interaction programmatically.\n", - "\n", - "### Access required\n", - "\n", - "The content of this notebook requires access to Deep Search capabilities which are not\n", - "available on the public access system.\n", - "\n", - "[Contact us](https://ds4sd.github.io) if you are interested in exploring\n", - "these Deep Search capabilities.\n", - "\n", - "\n", - "### GenAI Integration required\n", - "\n", - "When interacting with the virtual assistant, Deep Search requires a connection to a Generative AI API. Currently, we support connections to [watsonx.ai](https://www.ibm.com/products/watsonx-ai) or the IBM-internal GenAI platform BAM.\n", - "\n", - "Deep Search allows custom GenAI configurations for each project.\n", - "In the following example you will require to work in a project which has such GenAI capabilities activated." - ] - }, - { - "cell_type": "markdown", - "id": "256aef50-71a1-4278-9b22-17cb99a6566e", - "metadata": {}, - "source": [ - "### Set notebook parameters\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "5b244bdd-1b52-41ff-b63e-9a203570d210", - "metadata": {}, - "outputs": [], - "source": [ - "from dsnotebooks.settings import CollQANotebookSettings\n", - "\n", - "# notebooks settings auto-loaded from .env / env vars\n", - "notebook_settings = CollQANotebookSettings()\n", - "\n", - "PROFILE_NAME = notebook_settings.profile # the profile to use\n", - "PROJ_KEY = notebook_settings.proj_key # the project to use\n", - "INDEX_KEY = notebook_settings.sem_on_idx_key # the collection to use\n", - "\n", - "SKIP_INGESTED_DOCS = (\n", - " notebook_settings.skip_ingested_docs\n", - ") # whether to skip any already semantically ingested docs\n", - "\n", - "RETR_K = notebook_settings.retr_k # the number of search results to retrieve\n", - "TEXT_WEIGHT = (\n", - " notebook_settings.text_weight\n", - ") # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n", - "RERANK = notebook_settings.rerank # whether to rerank the search results\n", - "RAISE = (\n", - " notebook_settings.raise_on_sem_err\n", - ") # whether semantic operation errors should raise an exception or be reflected in response fields" - ] - }, - { - "cell_type": "markdown", - "id": "a5269060-bb5f-4fe3-9b64-547202db6714", - "metadata": {}, - "source": [ - "### Import example dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5d236ea0-db1c-4171-8e11-cdd0bad69d66", - "metadata": {}, - "outputs": [], - "source": [ - "# Import standard dependenices\n", - "import pandas as pd\n", - "import rich\n", - "\n", - "# IPython utilities\n", - "from IPython.display import display, Markdown\n", - "\n", - "# Import the deepsearch-toolkit\n", - "from deepsearch.cps.client.api import CpsApi\n", - "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n", - "from deepsearch.cps.queries import DataQuery, RAGQuery, SemanticQuery\n", - "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem" - ] - }, - { - "cell_type": "markdown", - "id": "293c249b-6018-46f2-b4d8-795f994d4729", - "metadata": {}, - "source": [ - "### Connect to Deep Search" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9c108432-a285-4c7b-a996-008ac3ff3d7a", - "metadata": {}, - "outputs": [], - "source": [ - "api = CpsApi.from_env(profile_name=PROFILE_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Utils" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def render_provenance_url(\n", - " api: CpsApi,\n", - " coords: ElasticProjectDataCollectionSource,\n", - " retr_item: SearchResultItem,\n", - "):\n", - " ## compute URL to the document in the Deep Search UI\n", - " item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\") + 1 :])\n", - " doc_url = api.documents.generate_url(\n", - " document_hash=retr_item.doc_hash,\n", - " data_source=coords,\n", - " item_index=item_index,\n", - " )\n", - " display(\n", - " Markdown(\n", - " f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "38cde869-46d1-4833-8eb3-2381b5e5fb68", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Prepare the collection coordinates:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "coll_coords = ElasticProjectDataCollectionSource(\n", - " proj_key=PROJ_KEY,\n", - " index_key=INDEX_KEY,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are using a small collection, so we can just list its documents to get an idea of its contents (for more details on querying, check the [Data Query Quick Start](https://github.com/DS4SD/deepsearch-examples/tree/main/examples/data_query_quick_start))." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2b38875e-f39c-4dd5-9d42-3ffca5d0bdac", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Finished fetching all data. Total is 10 records.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FilenameDocHash
0natural-language-processing.pdf000f892ddcc67f165797a96e94f44fb9e0697c7912a383...
1ibm-z.pdf07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d...
2ibm.pdf234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997...
3ibm-the-great-mind-challenge.pdf335120a57b418655196e3315b562a2f9e89cedeaef9318...
4turing-award.pdf8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272...
5ibm-research.pdfb30bc667a324ae111d025526563b674a8d3fd869bc07c8...
6artificial-intelligence.pdfb60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b...
7machine-learning.pdfe470e7b42a92c8e5f25094362361947b9203e0074c2223...
8deep-blue-chess-computer.pdffa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8...
9red-hat.pdffb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87...
\n", - "
" - ], - "text/plain": [ - " Filename \\\n", - "0 natural-language-processing.pdf \n", - "1 ibm-z.pdf \n", - "2 ibm.pdf \n", - "3 ibm-the-great-mind-challenge.pdf \n", - "4 turing-award.pdf \n", - "5 ibm-research.pdf \n", - "6 artificial-intelligence.pdf \n", - "7 machine-learning.pdf \n", - "8 deep-blue-chess-computer.pdf \n", - "9 red-hat.pdf \n", - "\n", - " DocHash \n", - "0 000f892ddcc67f165797a96e94f44fb9e0697c7912a383... \n", - "1 07e56eb5a10f725fccad9386d126b7b05bec1fa71b9b3d... \n", - "2 234bc5cf2c860d49574b0ff7191c354b7bbc11472a0997... \n", - "3 335120a57b418655196e3315b562a2f9e89cedeaef9318... \n", - "4 8a7c91a269abc3063df9f4e19f7961ddb8e2393fa0f272... \n", - "5 b30bc667a324ae111d025526563b674a8d3fd869bc07c8... \n", - "6 b60a87c1d62a59d517f2fd6f2d3ea1a96c58b651332a8b... \n", - "7 e470e7b42a92c8e5f25094362361947b9203e0074c2223... \n", - "8 fa7ce2f66a7a5e061813d36348425f81d9e7ebc23454d8... \n", - "9 fb53bb607f2e9642d7fe044585d1dcdb052c57febe1b87... " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Prepare the data query\n", - "query = DataQuery(\n", - " search_query=\"*\", # The search query to be executed\n", - " source=[ # Which fields of documents we want to fetch\n", - " \"file-info.document-hash\",\n", - " \"file-info.filename\",\n", - " # \"description.title\",\n", - " ],\n", - " coordinates=coll_coords, # The data collection to be queries\n", - ")\n", - "\n", - "# Query Deep Search for the documents matching the query\n", - "results = []\n", - "query_results = api.queries.run(query)\n", - "for row in query_results.outputs[\"data_outputs\"]:\n", - " # Add row to results table\n", - " results.append(\n", - " {\n", - " \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n", - " \"DocHash\": row[\"_source\"][\"file-info\"][\"document-hash\"],\n", - " # \"Title\": row[\"_source\"].get(\"description\", {}).get(\"title\"),\n", - " }\n", - " )\n", - "\n", - "print(f\"Finished fetching all data. Total is {len(results)} records.\")\n", - "\n", - "# Visualize the table with all results\n", - "df = pd.json_normalize(results)\n", - "display(df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare source" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from deepsearch.cps.client.components.documents import (\n", - " PrivateDataCollectionSource,\n", - " PrivateDataDocumentSource,\n", - " PublicDataDocumentSource,\n", - ")\n", - "\n", - "data_source = PrivateDataCollectionSource(\n", - " source=coll_coords,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ingestion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the cell below we show how to semantically index your collection (indexing of already indexed docs is controlled via param `skip_ingested_docs`):" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pva/work/github.com/DS4SD/deepsearch-examples/.venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", - " Expected `list[str]` but got `_LiteralGenericAlias` - serialized value may not be as expected\n", - " return self.__pydantic_serializer__.to_python(\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ing_out': {}}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# launch the ingestion of the collection for DocumentQA\n", - "task = api.documents.semantic_ingest(\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " skip_ingested_docs=SKIP_INGESTED_DOCS,\n", - ")\n", - "\n", - "# wait for the ingestion task to finish\n", - "api.tasks.wait_for(PROJ_KEY, task.task_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RAG" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "ee573e76-98ea-43ce-a2ba-a81f64b3adf3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-       "    answers=[\n",
-       "        RAGAnswerItem(\n",
-       "            answer='The IBM lab in Zurich is located in Rüschlikon, Switzerland.',\n",
-       "            grounding=RAGGroundingInfo(\n",
-       "                retr_items=[\n",
-       "                    SearchResultItem(\n",
-       "                        doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "                        chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, \n",
-       "ZRL) is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \n",
-       "Switzerland.',\n",
-       "                        main_path='main-text.70',\n",
-       "                        path_group=['main-text.69', 'main-text.70'],\n",
-       "                        source_is_text=True\n",
-       "                    )\n",
-       "                ],\n",
-       "                gen_ctx_paths=[\n",
-       "                    'main-text.58',\n",
-       "                    'main-text.59',\n",
-       "                    'main-text.60',\n",
-       "                    'main-text.61',\n",
-       "                    'main-text.62',\n",
-       "                    'main-text.63',\n",
-       "                    'main-text.64',\n",
-       "                    'main-text.65',\n",
-       "                    'main-text.66',\n",
-       "                    'main-text.67',\n",
-       "                    'main-text.68',\n",
-       "                    'main-text.69',\n",
-       "                    'main-text.70',\n",
-       "                    'main-text.71',\n",
-       "                    'main-text.72',\n",
-       "                    'main-text.73',\n",
-       "                    'main-text.74',\n",
-       "                    'main-text.75',\n",
-       "                    'main-text.76'\n",
-       "                ]\n",
-       "            ),\n",
-       "            prompt=None\n",
-       "        )\n",
-       "    ],\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-       "            main_path='main-text.70',\n",
-       "            path_group=['main-text.69', 'main-text.70'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-       "solutions.',\n",
-       "            main_path='main-text.71',\n",
-       "            path_group=['main-text.69', 'main-text.71'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
-       "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
-       "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
-       "            main_path='main-text.74',\n",
-       "            path_group=['main-text.69', 'main-text.74'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'The IBM lab in Zurich is located in Rüschlikon, Switzerland.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, \u001b[0m\n", - "\u001b[32mZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, \u001b[0m\n", - "\u001b[32mSwitzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.58'\u001b[0m,\n", - " \u001b[32m'main-text.59'\u001b[0m,\n", - " \u001b[32m'main-text.60'\u001b[0m,\n", - " \u001b[32m'main-text.61'\u001b[0m,\n", - " \u001b[32m'main-text.62'\u001b[0m,\n", - " \u001b[32m'main-text.63'\u001b[0m,\n", - " \u001b[32m'main-text.64'\u001b[0m,\n", - " \u001b[32m'main-text.65'\u001b[0m,\n", - " \u001b[32m'main-text.66'\u001b[0m,\n", - " \u001b[32m'main-text.67'\u001b[0m,\n", - " \u001b[32m'main-text.68'\u001b[0m,\n", - " \u001b[32m'main-text.69'\u001b[0m,\n", - " \u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[32m'main-text.72'\u001b[0m,\n", - " \u001b[32m'main-text.73'\u001b[0m,\n", - " \u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[32m'main-text.75'\u001b[0m,\n", - " \u001b[32m'main-text.76'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", - "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", - "\u001b[32mIndustry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Where is the IBM lab in Zurich?\"\n", - "\n", - "# submit natural-language query on collection\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Additionally, we can generate a provenance URL to the document in the Deep Search UI:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyYjMwYmM2NjdhMzI0YWUxMTFkMDI1NTI2NTYzYjY3NGE4ZDNmZDg2OWJjMDdjOGZkMjA0YWE5NWIwNWQ0MWYwYyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmIzMGJjNjY3YTMyNGFlMTExZDAyNTUyNjU2M2I2NzRhOGQzZmQ4NjliYzA3YzhmZDIwNGFhOTViMDVkNDFmMGMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E3MCU3RCU3RA%3D%3D)." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(\n", - " api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us try out a different question on our document corpus.\n", - "Here we also include (commented out) various additional parameters the user can optionally set:\n", - "- `retr_k`: number of items to retrieve\n", - "- `text_weight`: weight of lexical search (`0.0`: fully semantic search, `1.0`: fully lexical search, anything in-between: hybrid search)\n", - "- `rerank`: whether to rerank the retrieval results\n", - "- `gen_ctx_extr_method` (Literal[\"window\", \"page\"], optional): method for gen context extraction from document; defaults to \"window\"\n", - "- `gen_ctx_window_size` (int, optional): (relevant only if `gen_ctx_extr_method` is \"window\") max chars to use for extracted gen context (actual extraction quantized on doc item level); defaults to 5000\n", - "- `gen_ctx_window_lead_weight` (float, optional): (relevant only if `gen_ctx_extr_method` is \"window\") weight of leading text for distributing remaining window size after extracting the `main_path`; defaults to 0.5 (centered around `main_path`)\n", - "- `return_prompt` (bool, optional): whether to return the instantiated prompt; defaults to False\n", - "- `gen_timeout` (float, optional): timeout for LLM generation; defaults to None, i.e. determined by system\n", - "\n", - "For more details refer to `deepsearch.cps.queries.RAGQuery`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "89d95a17-1569-4c90-a983-8ca437b7569d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-       "    answers=[\n",
-       "        RAGAnswerItem(\n",
-       "            answer='The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \n",
-       "the field of computer gaming and artificial intelligence.',\n",
-       "            grounding=RAGGroundingInfo(\n",
-       "                retr_items=[\n",
-       "                    SearchResultItem(\n",
-       "                        doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-       "                        chunk='History and relationships to other fields\\nThe term machine learning was coined in \n",
-       "1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
-       "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
-       "                        main_path='main-text.6',\n",
-       "                        path_group=['main-text.5', 'main-text.6'],\n",
-       "                        source_is_text=True\n",
-       "                    )\n",
-       "                ],\n",
-       "                gen_ctx_paths=[\n",
-       "                    'main-text.1',\n",
-       "                    'main-text.2',\n",
-       "                    'main-text.3',\n",
-       "                    'main-text.4',\n",
-       "                    'main-text.5',\n",
-       "                    'main-text.6',\n",
-       "                    'main-text.7',\n",
-       "                    'main-text.8',\n",
-       "                    'main-text.9',\n",
-       "                    'main-text.10'\n",
-       "                ]\n",
-       "            ),\n",
-       "            prompt=None\n",
-       "        )\n",
-       "    ],\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-       "            chunk='History and relationships to other fields\\nThe term machine learning was coined in 1959 by \n",
-       "Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \n",
-       "$^{[10][11]}$ The synonym self-teaching computers was also used in this time period. [12][13]',\n",
-       "            main_path='main-text.6',\n",
-       "            path_group=['main-text.5', 'main-text.6'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-       "            chunk=\"Machine learning\\nMachine learning (ML) is an umbrella term for solving problems for which \n",
-       "development of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \n",
-       "helping machines 'discover ' their 'own ' algorithms, $^{[1]}$ without needing to be explicitly told what to do by \n",
-       "any human-developed algorithms. $^{[2]}$ Recently, generative artificial neural networks have been able to surpass \n",
-       "results of many previous approaches. $^{[3][4]}$ Machine-learning approaches have been applied to large language \n",
-       "models, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \n",
-       "develop algorithms to perform the needed tasks. [5][6]\",\n",
-       "            main_path='main-text.2',\n",
-       "            path_group=['main-text.1', 'main-text.2'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1',\n",
-       "            chunk='Artificial intelligence\\nMachine learning (ML), reorganized and recognized as its own field, \n",
-       "started to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \n",
-       "solvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \n",
-       "AI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. [24]',\n",
-       "            main_path='main-text.15',\n",
-       "            path_group=['main-text.10', 'main-text.15'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in \u001b[0m\n", - "\u001b[32mthe field of computer gaming and artificial intelligence.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in \u001b[0m\n", - "\u001b[32m1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.1'\u001b[0m,\n", - " \u001b[32m'main-text.2'\u001b[0m,\n", - " \u001b[32m'main-text.3'\u001b[0m,\n", - " \u001b[32m'main-text.4'\u001b[0m,\n", - " \u001b[32m'main-text.5'\u001b[0m,\n", - " \u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[32m'main-text.7'\u001b[0m,\n", - " \u001b[32m'main-text.8'\u001b[0m,\n", - " \u001b[32m'main-text.9'\u001b[0m,\n", - " \u001b[32m'main-text.10'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'History and relationships to other fields\\nThe term machine learning was coined in 1959 by \u001b[0m\n", - "\u001b[32mArthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ The synonym self-teaching computers was also used in this time period. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.6'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"Machine\u001b[0m\u001b[32m learning\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is an umbrella term for solving problems for which \u001b[0m\n", - "\u001b[32mdevelopment of algorithms by human programmers would be cost-prohibitive, and instead the problems are solved by \u001b[0m\n", - "\u001b[32mhelping machines 'discover ' their 'own ' algorithms, $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ without needing to be explicitly told what to do by \u001b[0m\n", - "\u001b[32many human-developed algorithms. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Recently, generative artificial neural networks have been able to surpass \u001b[0m\n", - "\u001b[32mresults of many previous approaches. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m3\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Machine-learning approaches have been applied to large language \u001b[0m\n", - "\u001b[32mmodels, computer vision, speech recognition, email filtering, agriculture and medicine, where it is too costly to \u001b[0m\n", - "\u001b[32mdevelop algorithms to perform the needed tasks. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.2'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.1'\u001b[0m, \u001b[32m'main-text.2'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'e470e7b42a92c8e5f25094362361947b9203e0074c2223505b4921940ec075a1'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Artificial intelligence\\nMachine learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mML\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, reorganized and recognized as its own field, \u001b[0m\n", - "\u001b[32mstarted to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling \u001b[0m\n", - "\u001b[32msolvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from \u001b[0m\n", - "\u001b[32mAI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m24\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.15'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.10'\u001b[0m, \u001b[32m'main-text.15'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Who came up with the term 'machine learning'?\"\n", - "\n", - "# submit natural-language query on collection\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - " # text_weight=TEXT_WEIGHT,\n", - " # rerank=RERANK,\n", - " ## optional generation params\n", - " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", - " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", - " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", - " # gen_ctx_extr_method=\"window\",\n", - " # gen_ctx_window_size=5000,\n", - " # gen_ctx_window_lead_weight=0.5\n", - " # return_prompt=True,\n", - " # gen_timeout=10.0,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As seen by the returned `doc_hash`, this answer came from a different document than the previous one." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyZTQ3MGU3YjQyYTkyYzhlNWYyNTA5NDM2MjM2MTk0N2I5MjAzZTAwNzRjMjIyMzUwNWI0OTIxOTQwZWMwNzVhMSU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmU0NzBlN2I0MmE5MmM4ZTVmMjUwOTQzNjIzNjE5NDdiOTIwM2UwMDc0YzIyMjM1MDViNDkyMTk0MGVjMDc1YTElMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E2JTdEJTdE)." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(\n", - " api=api, coords=coll_coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Semantic retrieval\n", - "\n", - "Besides RAG, which includes natural language generation, a user may only be interested in\n", - "the semantic retrieval part.\n", - "\n", - "This can be obtained very similarly to RAG, as shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SearchResult(\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-       "            main_path='main-text.70',\n",
-       "            path_group=['main-text.69', 'main-text.70'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-       "solutions.',\n",
-       "            main_path='main-text.71',\n",
-       "            path_group=['main-text.69', 'main-text.71'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \n",
-       "scientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \n",
-       "Industry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.',\n",
-       "            main_path='main-text.74',\n",
-       "            path_group=['main-text.69', 'main-text.74'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSearchResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nThe research projects pursued at the IBM Zurich lab are organized into four \u001b[0m\n", - "\u001b[32mscientific and technical departments: Science & Technology, Cloud and AI Systems Research, Cognitive Computing & \u001b[0m\n", - "\u001b[32mIndustry Solutions and Security Research. The lab is currently managed by Alessandro Curioni.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.74'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Where is the IBM lab in Zurich?\"\n", - "\n", - "# submit natural-language query on collection\n", - "question_query = SemanticQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional params\n", - " retr_k=RETR_K,\n", - " # text_weight=TEXT_WEIGHT,\n", - " # rerank=RERANK,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "search_result = SearchResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(search_result)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/qa_doc_collection/README.md b/examples/qa_rag_semantic_retrieval/README.md similarity index 68% rename from examples/qa_doc_collection/README.md rename to examples/qa_rag_semantic_retrieval/README.md index f54e8fc..843f90d 100644 --- a/examples/qa_doc_collection/README.md +++ b/examples/qa_rag_semantic_retrieval/README.md @@ -1,8 +1,9 @@ -# Document Collection QA +# Document QA -Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answers your questions using the information in a document collection. +Deep Search allows users to interact with the documents using conversational AI, i.e. the user interacts with a virtual assistant which answers their questions using the information from a document collection or a specific document. -:point_right: Run the [doc_collection_qa.ipynb](./doc_collection_qa.ipynb) notebook. +:point_right: For getting started, check out [QA Quick Start](./qa_quick_start.ipynb). +:point_right: For advanced usage, check out [QA Deep Dive](./qa_deep_dive.ipynb). ### Access required diff --git a/examples/qa_rag_semantic_retrieval/qa_deep_dive.ipynb b/examples/qa_rag_semantic_retrieval/qa_deep_dive.ipynb index 2bbbf5d..b64019c 100644 --- a/examples/qa_rag_semantic_retrieval/qa_deep_dive.ipynb +++ b/examples/qa_rag_semantic_retrieval/qa_deep_dive.ipynb @@ -7,9 +7,9 @@ "source": [ "# QA Deep Dive\n", "\n", - "Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answer your questions using the information in the document.\n", + "In this QA Deep Dive notebook, we take a closer look at semantic ingestion, RAG, and retrieval, presenting the various customization options, and providing hints to help you make the most out of your QA application.\n", "\n", - "In this example we demonstrate how achive the same interaction programmatically.\n", + "For getting started with basic QA usage, check out [QA Quick Start](./qa_quick_start.ipynb).\n", "\n", "### Access required\n", "\n", @@ -243,7 +243,7 @@ "source": [ "### RAG\n", "\n", - "Besides the standard RAG usage shown in `quick_start.ipynb`, `RAGQuery` has numerous additional parameters for customizing aspects of retrieval, generation, and overall RAG pipeline:\n" + "Besides the standard RAG usage shown in [QA Quick Start](./qa_quick_start.ipynb), `RAGQuery` has numerous additional parameters for customizing aspects of retrieval, generation, and overall RAG pipeline:\n" ] }, { @@ -634,16 +634,16 @@ "data": { "text/html": [ "
QueryTimings(\n",
-       "    overall=1.6069965502247214,\n",
+       "    overall=1.1071437392383814,\n",
        "    tasks={\n",
        "        'QA': TaskTimings(\n",
-       "            overall=1.606641505844891,\n",
+       "            overall=1.106700461357832,\n",
        "            details={\n",
-       "                'pipeline': 1.0898093581199646,\n",
-       "                'encode': 0.03505689650774002,\n",
-       "                'search': 0.02780105359852314,\n",
+       "                'pipeline': 0.7934164367616177,\n",
+       "                'encode': 0.02867589332163334,\n",
+       "                'search': 0.01751641556620598,\n",
        "                'rerank': 0.0,\n",
-       "                'generate': 0.9914452265948057\n",
+       "                'generate': 0.7073831260204315\n",
        "            }\n",
        "        )\n",
        "    }\n",
@@ -652,16 +652,16 @@
       ],
       "text/plain": [
        "\u001b[1;35mQueryTimings\u001b[0m\u001b[1m(\u001b[0m\n",
-       "    \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.6069965502247214\u001b[0m,\n",
+       "    \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.1071437392383814\u001b[0m,\n",
        "    \u001b[33mtasks\u001b[0m=\u001b[1m{\u001b[0m\n",
        "        \u001b[32m'QA'\u001b[0m: \u001b[1;35mTaskTimings\u001b[0m\u001b[1m(\u001b[0m\n",
-       "            \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.606641505844891\u001b[0m,\n",
+       "            \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.106700461357832\u001b[0m,\n",
        "            \u001b[33mdetails\u001b[0m=\u001b[1m{\u001b[0m\n",
-       "                \u001b[32m'pipeline'\u001b[0m: \u001b[1;36m1.0898093581199646\u001b[0m,\n",
-       "                \u001b[32m'encode'\u001b[0m: \u001b[1;36m0.03505689650774002\u001b[0m,\n",
-       "                \u001b[32m'search'\u001b[0m: \u001b[1;36m0.02780105359852314\u001b[0m,\n",
+       "                \u001b[32m'pipeline'\u001b[0m: \u001b[1;36m0.7934164367616177\u001b[0m,\n",
+       "                \u001b[32m'encode'\u001b[0m: \u001b[1;36m0.02867589332163334\u001b[0m,\n",
+       "                \u001b[32m'search'\u001b[0m: \u001b[1;36m0.01751641556620598\u001b[0m,\n",
        "                \u001b[32m'rerank'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n",
-       "                \u001b[32m'generate'\u001b[0m: \u001b[1;36m0.9914452265948057\u001b[0m\n",
+       "                \u001b[32m'generate'\u001b[0m: \u001b[1;36m0.7073831260204315\u001b[0m\n",
        "            \u001b[1m}\u001b[0m\n",
        "        \u001b[1m)\u001b[0m\n",
        "    \u001b[1m}\u001b[0m\n",
@@ -683,7 +683,7 @@
    "source": [
     "### Semantic retrieval\n",
     "\n",
-    "Besides the standard semantic retrieval usage shown in `quick_start.ipynb`, `SemanticQuery` has numerous additional parameters:"
+    "Besides the standard semantic retrieval usage shown in [QA Quick Start](./qa_quick_start.ipynb), `SemanticQuery` has numerous additional parameters:"
    ]
   },
   {
@@ -861,15 +861,15 @@
      "data": {
       "text/html": [
        "
QueryTimings(\n",
-       "    overall=1.1053776880726218,\n",
+       "    overall=1.1048420211300254,\n",
        "    tasks={\n",
        "        'QA': TaskTimings(\n",
-       "            overall=1.104994640685618,\n",
+       "            overall=1.1045362269505858,\n",
        "            details={\n",
-       "                'pipeline': 0.6104133129119873,\n",
-       "                'encode': 0.02873420901596546,\n",
-       "                'search': 0.019157135859131813,\n",
-       "                'rerank': 0.5614922270178795\n",
+       "                'pipeline': 0.6429227869957685,\n",
+       "                'encode': 0.0342963021248579,\n",
+       "                'search': 0.026565583422780037,\n",
+       "                'rerank': 0.5808815937489271\n",
        "            }\n",
        "        )\n",
        "    }\n",
@@ -878,15 +878,15 @@
       ],
       "text/plain": [
        "\u001b[1;35mQueryTimings\u001b[0m\u001b[1m(\u001b[0m\n",
-       "    \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.1053776880726218\u001b[0m,\n",
+       "    \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.1048420211300254\u001b[0m,\n",
        "    \u001b[33mtasks\u001b[0m=\u001b[1m{\u001b[0m\n",
        "        \u001b[32m'QA'\u001b[0m: \u001b[1;35mTaskTimings\u001b[0m\u001b[1m(\u001b[0m\n",
-       "            \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.104994640685618\u001b[0m,\n",
+       "            \u001b[33moverall\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1;36m.1045362269505858\u001b[0m,\n",
        "            \u001b[33mdetails\u001b[0m=\u001b[1m{\u001b[0m\n",
-       "                \u001b[32m'pipeline'\u001b[0m: \u001b[1;36m0.6104133129119873\u001b[0m,\n",
-       "                \u001b[32m'encode'\u001b[0m: \u001b[1;36m0.02873420901596546\u001b[0m,\n",
-       "                \u001b[32m'search'\u001b[0m: \u001b[1;36m0.019157135859131813\u001b[0m,\n",
-       "                \u001b[32m'rerank'\u001b[0m: \u001b[1;36m0.5614922270178795\u001b[0m\n",
+       "                \u001b[32m'pipeline'\u001b[0m: \u001b[1;36m0.6429227869957685\u001b[0m,\n",
+       "                \u001b[32m'encode'\u001b[0m: \u001b[1;36m0.0342963021248579\u001b[0m,\n",
+       "                \u001b[32m'search'\u001b[0m: \u001b[1;36m0.026565583422780037\u001b[0m,\n",
+       "                \u001b[32m'rerank'\u001b[0m: \u001b[1;36m0.5808815937489271\u001b[0m\n",
        "            \u001b[1m}\u001b[0m\n",
        "        \u001b[1m)\u001b[0m\n",
        "    \u001b[1m}\u001b[0m\n",
diff --git a/examples/qa_rag_semantic_retrieval/qa_quick_start.ipynb b/examples/qa_rag_semantic_retrieval/qa_quick_start.ipynb
index d183d45..1b3e271 100644
--- a/examples/qa_rag_semantic_retrieval/qa_quick_start.ipynb
+++ b/examples/qa_rag_semantic_retrieval/qa_quick_start.ipynb
@@ -7,9 +7,9 @@
    "source": [
     "# QA Quick Start\n",
     "\n",
-    "Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answer your questions using the information in the document.\n",
+    "In this QA Quick Start notebook, we take a first look at semantic ingestion, RAG, and retrieval, presenting basic usage patterns.\n",
     "\n",
-    "In this example we demonstrate how achive the same interaction programmatically.\n",
+    "For more advanced usage, check out [QA Deep Dive](./qa_deep_dive.ipynb).\n",
     "\n",
     "### Access required\n",
     "\n",
@@ -117,7 +117,7 @@
     "\n",
     "The cell below shows how to configure a private data source, i.e. either a whole private collection (in which case `document_hash` should be `None` or omitted) or a given doc within one.\n",
     "\n",
-    "For more details on data sources check out `qa_deep_dive.ipynb`."
+    "For more details on data sources check out [QA Deep Dive](./qa_deep_dive.ipynb)."
    ]
   },
   {
@@ -185,7 +185,7 @@
     "\n",
     "The cell below demonstrates basic RAG usage.\n",
     "\n",
-    "For more advanced usage and parametrization, check out qa_deep_dive.ipynb."
+    "For more advanced usage and parametrization, check out [QA Deep Dive](./qa_deep_dive.ipynb)."
    ]
   },
   {
diff --git a/examples/qa_single_doc/README.md b/examples/qa_single_doc/README.md
deleted file mode 100644
index aa59a40..0000000
--- a/examples/qa_single_doc/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# Single Document QA
-
-Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answers your questions using the information in a given document.
-
-:point_right: Run the [single_doc_qa.ipynb](./single_doc_qa.ipynb) notebook.
-
-
-### Access required
-
-The content of this notebook requires access to Deep Search capabilities which are not
-available on the public access system.
-
-[Contact us](https://ds4sd.github.io/) if you are interested in exploring
-the enterprise-level Deep Search capabilities.
-
-
-### GenAI Integration required
-
-When interacting with the virtual assistant, Deep Search requires a connection to a Generative AI API. Currently, we support connections to [watsonx.ai](https://www.ibm.com/products/watsonx-ai) or the IBM-internal GenAI platform BAM.
-
-Deep Search allows custom GenAI configurations for each project.
-In the above notebooks you will need to work in a project which has such GenAI capabilities activated.
diff --git a/examples/qa_single_doc/single_doc_qa.ipynb b/examples/qa_single_doc/single_doc_qa.ipynb
deleted file mode 100644
index 90ab125..0000000
--- a/examples/qa_single_doc/single_doc_qa.ipynb
+++ /dev/null
@@ -1,1675 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "99f717ef-4cba-4300-b258-0b1c248cb873",
-   "metadata": {},
-   "source": [
-    "# RAG and Semantic Retrieval on a Single Document\n",
-    "\n",
-    "Deep Search allows users to interact with the documents using conversational AI, i.e. you interact with a virtual assistant which answer your questions using the information in the document.\n",
-    "\n",
-    "In this example we demonstrate how achive the same interaction programmatically.\n",
-    "\n",
-    "### Access required\n",
-    "\n",
-    "The content of this notebook requires access to Deep Search capabilities which are not\n",
-    "available on the public access system.\n",
-    "\n",
-    "[Contact us](https://ds4sd.github.io) if you are interested in exploring\n",
-    "these Deep Search capabilities.\n",
-    "\n",
-    "\n",
-    "### GenAI Integration required\n",
-    "\n",
-    "When interacting with the virtual assistant, Deep Search requires a connection to a Generative AI API. Currently, we support connections to [watsonx.ai](https://www.ibm.com/products/watsonx-ai) or the IBM-internal GenAI platform BAM.\n",
-    "\n",
-    "Deep Search allows custom GenAI configurations for each project.\n",
-    "In the following example you will require to work in a project which has such GenAI capabilities activated."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "256aef50-71a1-4278-9b22-17cb99a6566e",
-   "metadata": {},
-   "source": [
-    "### Set notebook parameters\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "5b244bdd-1b52-41ff-b63e-9a203570d210",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dsnotebooks.settings import DocQANotebookSettings\n",
-    "\n",
-    "# notebooks settings auto-loaded from .env / env vars\n",
-    "notebook_settings = DocQANotebookSettings()\n",
-    "\n",
-    "PROFILE_NAME = notebook_settings.profile  # the profile to use\n",
-    "PROJ_KEY = notebook_settings.proj_key  # the project to use\n",
-    "\n",
-    "# index and doc for doc QA from semantically indexed collection\n",
-    "SEM_ON_IDX_KEY = notebook_settings.sem_on_idx_key\n",
-    "SEM_ON_IDX_DOC_HASH = notebook_settings.sem_on_idx_doc_hash\n",
-    "\n",
-    "# index and doc for doc QA from not semantically indexed collection\n",
-    "SEM_OFF_IDX_KEY = notebook_settings.sem_off_idx_key\n",
-    "SEM_OFF_IDX_DOC_HASH = notebook_settings.sem_off_idx_doc_hash\n",
-    "\n",
-    "SKIP_INGESTED_DOCS = (\n",
-    "    notebook_settings.skip_ingested_docs\n",
-    ")  # whether to skip any already semantically ingested docs\n",
-    "\n",
-    "RETR_K = notebook_settings.retr_k  # the number of search results to retrieve\n",
-    "TEXT_WEIGHT = (\n",
-    "    notebook_settings.text_weight\n",
-    ")  # the weight of lexical search (0.0: semantic-only, 1.0: lexical-only, anything in between: hybrid search)\n",
-    "RERANK = notebook_settings.rerank  # whether to rerank the search results\n",
-    "RAISE = (\n",
-    "    notebook_settings.raise_on_sem_err\n",
-    ")  # whether semantic operation errors should raise an exception or be reflected in response fields"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a5269060-bb5f-4fe3-9b64-547202db6714",
-   "metadata": {},
-   "source": [
-    "### Import example dependencies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "5d236ea0-db1c-4171-8e11-cdd0bad69d66",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import standard dependenices\n",
-    "import rich\n",
-    "\n",
-    "# IPython utilities\n",
-    "from IPython.display import display, Markdown\n",
-    "\n",
-    "# Import the deepsearch-toolkit\n",
-    "from deepsearch.cps.client.api import CpsApi\n",
-    "from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource\n",
-    "from deepsearch.cps.queries import RAGQuery, SemanticQuery\n",
-    "from deepsearch.cps.queries.results import RAGResult, SearchResult, SearchResultItem"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "293c249b-6018-46f2-b4d8-795f994d4729",
-   "metadata": {},
-   "source": [
-    "### Connect to Deep Search"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "9c108432-a285-4c7b-a996-008ac3ff3d7a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "api = CpsApi.from_env(profile_name=PROFILE_NAME)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Utils"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def render_provenance_url(\n",
-    "    api: CpsApi,\n",
-    "    coords: ElasticProjectDataCollectionSource,\n",
-    "    retr_item: SearchResultItem,\n",
-    "):\n",
-    "    ## compute URL to the document in the Deep Search UI\n",
-    "    item_index = int(retr_item.main_path[retr_item.main_path.rfind(\".\") + 1 :])\n",
-    "    doc_url = api.documents.generate_url(\n",
-    "        document_hash=retr_item.doc_hash,\n",
-    "        data_source=coords,\n",
-    "        item_index=item_index,\n",
-    "    )\n",
-    "    display(\n",
-    "        Markdown(\n",
-    "            f\"The provenance of the answer can be inspected on the [source document]({doc_url}).\"\n",
-    "        )\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "38cde869-46d1-4833-8eb3-2381b5e5fb68",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## QA on document within semantically indexed collection"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Prepare data source"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from deepsearch.cps.client.components.documents import PrivateDataDocumentSource\n",
-    "\n",
-    "coords = ElasticProjectDataCollectionSource(\n",
-    "    proj_key=PROJ_KEY,\n",
-    "    index_key=SEM_ON_IDX_KEY,\n",
-    ")\n",
-    "data_source = PrivateDataDocumentSource(\n",
-    "    source=coords,\n",
-    "    document_hash=SEM_ON_IDX_DOC_HASH,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### RAG"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "If the document is part of a semantically indexed collection (see [Document Collection QA](https://github.com/DS4SD/deepsearch-examples/tree/main/examples/qa_doc_collection) for details),\n",
-    "we can directly do RAG on it as shown below:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
RAGResult(\n",
-       "    answers=[\n",
-       "        RAGAnswerItem(\n",
-       "            answer='The first European IBM research lab was located in Adliswil, Switzerland, near Zurich. It was \n",
-       "opened in 1956.',\n",
-       "            grounding=RAGGroundingInfo(\n",
-       "                retr_items=[\n",
-       "                    SearchResultItem(\n",
-       "                        doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "                        chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in \n",
-       "Adliswil, Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich \n",
-       "lab is staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, \n",
-       "graduate students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a \n",
-       "Client Center (formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology \n",
-       "prototypes and solutions.',\n",
-       "                        main_path='main-text.71',\n",
-       "                        path_group=['main-text.69', 'main-text.71'],\n",
-       "                        source_is_text=True\n",
-       "                    )\n",
-       "                ],\n",
-       "                gen_ctx_paths=[\n",
-       "                    'main-text.60',\n",
-       "                    'main-text.61',\n",
-       "                    'main-text.62',\n",
-       "                    'main-text.63',\n",
-       "                    'main-text.64',\n",
-       "                    'main-text.65',\n",
-       "                    'main-text.66',\n",
-       "                    'main-text.67',\n",
-       "                    'main-text.68',\n",
-       "                    'main-text.69',\n",
-       "                    'main-text.70',\n",
-       "                    'main-text.71',\n",
-       "                    'main-text.72',\n",
-       "                    'main-text.73',\n",
-       "                    'main-text.74',\n",
-       "                    'main-text.75',\n",
-       "                    'main-text.76',\n",
-       "                    'main-text.77',\n",
-       "                    'main-text.78'\n",
-       "                ]\n",
-       "            ),\n",
-       "            prompt=None\n",
-       "        )\n",
-       "    ],\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-       "solutions.',\n",
-       "            main_path='main-text.71',\n",
-       "            path_group=['main-text.69', 'main-text.71'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-       "            main_path='main-text.70',\n",
-       "            path_group=['main-text.69', 'main-text.70'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk=\"History\\nThe roots of today's IBM Research began with the 1945 opening of the Watson Scientific \n",
-       "Computing Laboratory at Columbia University. $^{[4]}$ This was the first IBM laboratory devoted to pure science and\n",
-       "later expanded into additional IBM Research locations in Westchester County, New York, starting in the 1950s, \n",
-       "$^{[5][6]}$ including the Thomas J. Watson Research Center in 1961. [5][6]\",\n",
-       "            main_path='main-text.8',\n",
-       "            path_group=['main-text.7', 'main-text.8'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'The first European IBM research lab was located in Adliswil, Switzerland, near Zurich. It was \u001b[0m\n", - "\u001b[32mopened in 1956.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in \u001b[0m\n", - "\u001b[32mAdliswil, Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich \u001b[0m\n", - "\u001b[32mlab is staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, \u001b[0m\n", - "\u001b[32mgraduate students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a \u001b[0m\n", - "\u001b[32mClient Center \u001b[0m\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology \u001b[0m\n", - "\u001b[32mprototypes and solutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.60'\u001b[0m,\n", - " \u001b[32m'main-text.61'\u001b[0m,\n", - " \u001b[32m'main-text.62'\u001b[0m,\n", - " \u001b[32m'main-text.63'\u001b[0m,\n", - " \u001b[32m'main-text.64'\u001b[0m,\n", - " \u001b[32m'main-text.65'\u001b[0m,\n", - " \u001b[32m'main-text.66'\u001b[0m,\n", - " \u001b[32m'main-text.67'\u001b[0m,\n", - " \u001b[32m'main-text.68'\u001b[0m,\n", - " \u001b[32m'main-text.69'\u001b[0m,\n", - " \u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[32m'main-text.72'\u001b[0m,\n", - " \u001b[32m'main-text.73'\u001b[0m,\n", - " \u001b[32m'main-text.74'\u001b[0m,\n", - " \u001b[32m'main-text.75'\u001b[0m,\n", - " \u001b[32m'main-text.76'\u001b[0m,\n", - " \u001b[32m'main-text.77'\u001b[0m,\n", - " \u001b[32m'main-text.78'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"History\u001b[0m\u001b[32m\\nThe roots of today's IBM Research began with the 1945 opening of the Watson Scientific \u001b[0m\n", - "\u001b[32mComputing Laboratory at Columbia University. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ This was the first IBM laboratory devoted to pure science and\u001b[0m\n", - "\u001b[32mlater expanded into additional IBM Research locations in Westchester County, New York, starting in the 1950s, \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ including the Thomas J. Watson Research Center in 1961. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.8'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.7'\u001b[0m, \u001b[32m'main-text.8'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Where was the first European IBM research lab located?\"\n", - "\n", - "# submit natural-language query on document\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Additionally, we can generate a provenance URL to the document in the Deep Search UI:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyYjMwYmM2NjdhMzI0YWUxMTFkMDI1NTI2NTYzYjY3NGE4ZDNmZDg2OWJjMDdjOGZkMjA0YWE5NWIwNWQ0MWYwYyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmIzMGJjNjY3YTMyNGFlMTExZDAyNTUyNjU2M2I2NzRhOGQzZmQ4NjliYzA3YzhmZDIwNGFhOTViMDVkNDFmMGMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E3MSU3RCU3RA%3D%3D)." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(\n", - " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let us try out a different question on our document corpus.\n", - "Here we also include (commented out) various additional parameters the user can optionally set:\n", - "- `retr_k`: number of items to retrieve\n", - "- `text_weight`: weight of lexical search (`0.0`: fully semantic search, `1.0`: fully lexical search, anything in-between: hybrid search)\n", - "- `rerank`: whether to rerank the retrieval results\n", - "- `gen_ctx_extr_method` (Literal[\"window\", \"page\"], optional): method for gen context extraction from document; defaults to \"window\"\n", - "- `gen_ctx_window_size` (int, optional): (relevant only if `gen_ctx_extr_method` is \"window\") max chars to use for extracted gen context (actual extraction quantized on doc item level); defaults to 5000\n", - "- `gen_ctx_window_lead_weight` (float, optional): (relevant only if `gen_ctx_extr_method` is \"window\") weight of leading text for distributing remaining window size after extracting the `main_path`; defaults to 0.5 (centered around `main_path`)\n", - "- `return_prompt` (bool, optional): whether to return the instantiated prompt; defaults to False\n", - "- `gen_timeout` (float, optional): timeout for LLM generation; defaults to None, i.e. determined by system\n", - "\n", - "For more details refer to `deepsearch.cps.queries.RAGQuery`." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-       "    answers=[\n",
-       "        RAGAnswerItem(\n",
-       "            answer='IBM currently has 19 research facilities spread across 12 laboratories on six continents. The \n",
-       "12 laboratories are: Africa, Almaden, Australia, Brazil, Cambridge-IBM Research and MIT-IBM Watson AI Lab, China, \n",
-       "India, Israel, Ireland, Japan, Switzerland, and IBM Thomas J. Watson Research Center. Each laboratory typically has\n",
-       "several research facilities located in different geographical areas.',\n",
-       "            grounding=RAGGroundingInfo(\n",
-       "                retr_items=[\n",
-       "                    SearchResultItem(\n",
-       "                        doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "                        chunk='Laboratories\\nIBM currently has 19 research facilities spread across 12 laboratories\n",
-       "on six continents: [32]\\nAfrica (Nairobi, Kenya, and Johannesburg, South Africa)\\nAlmaden (San Jose)\\nAustralia \n",
-       "(Melbourne)\\nBrazil (Sao Paulo and Rio de Janeiro)\\nCambridge-IBM Research and MIT-IBM Watson AI Lab (Cambridge, \n",
-       "US)\\nChina (Beijing)\\nIsrael (Haifa)\\nIreland (Dublin)\\nIndia (Delhi and Bengaluru)\\nJapan (Tokyo and \n",
-       "Shin-kawasaki)\\nSwitzerland (Zurich)\\nIBM Thomas J. Watson Research Center (Yorktown Heights and Albany)',\n",
-       "                        main_path='main-text.17',\n",
-       "                        path_group=[\n",
-       "                            'main-text.16',\n",
-       "                            'main-text.17',\n",
-       "                            'main-text.18',\n",
-       "                            'main-text.19',\n",
-       "                            'main-text.20',\n",
-       "                            'main-text.21',\n",
-       "                            'main-text.22',\n",
-       "                            'main-text.23',\n",
-       "                            'main-text.24',\n",
-       "                            'main-text.25',\n",
-       "                            'main-text.26',\n",
-       "                            'main-text.27',\n",
-       "                            'main-text.28',\n",
-       "                            'main-text.29'\n",
-       "                        ],\n",
-       "                        source_is_text=True\n",
-       "                    )\n",
-       "                ],\n",
-       "                gen_ctx_paths=[\n",
-       "                    'main-text.11',\n",
-       "                    'main-text.12',\n",
-       "                    'main-text.13',\n",
-       "                    'main-text.14',\n",
-       "                    'main-text.15',\n",
-       "                    'main-text.16',\n",
-       "                    'main-text.17',\n",
-       "                    'main-text.18',\n",
-       "                    'main-text.19',\n",
-       "                    'main-text.20',\n",
-       "                    'main-text.21',\n",
-       "                    'main-text.22',\n",
-       "                    'main-text.23',\n",
-       "                    'main-text.24',\n",
-       "                    'main-text.25',\n",
-       "                    'main-text.26',\n",
-       "                    'main-text.27',\n",
-       "                    'main-text.28',\n",
-       "                    'main-text.29',\n",
-       "                    'main-text.30',\n",
-       "                    'main-text.31',\n",
-       "                    'main-text.32',\n",
-       "                    'main-text.33',\n",
-       "                    'main-text.34',\n",
-       "                    'main-text.35',\n",
-       "                    'main-text.36',\n",
-       "                    'main-text.37'\n",
-       "                ]\n",
-       "            ),\n",
-       "            prompt=None\n",
-       "        )\n",
-       "    ],\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Laboratories\\nIBM currently has 19 research facilities spread across 12 laboratories on six \n",
-       "continents: [32]\\nAfrica (Nairobi, Kenya, and Johannesburg, South Africa)\\nAlmaden (San Jose)\\nAustralia \n",
-       "(Melbourne)\\nBrazil (Sao Paulo and Rio de Janeiro)\\nCambridge-IBM Research and MIT-IBM Watson AI Lab (Cambridge, \n",
-       "US)\\nChina (Beijing)\\nIsrael (Haifa)\\nIreland (Dublin)\\nIndia (Delhi and Bengaluru)\\nJapan (Tokyo and \n",
-       "Shin-kawasaki)\\nSwitzerland (Zurich)\\nIBM Thomas J. Watson Research Center (Yorktown Heights and Albany)',\n",
-       "            main_path='main-text.17',\n",
-       "            path_group=[\n",
-       "                'main-text.16',\n",
-       "                'main-text.17',\n",
-       "                'main-text.18',\n",
-       "                'main-text.19',\n",
-       "                'main-text.20',\n",
-       "                'main-text.21',\n",
-       "                'main-text.22',\n",
-       "                'main-text.23',\n",
-       "                'main-text.24',\n",
-       "                'main-text.25',\n",
-       "                'main-text.26',\n",
-       "                'main-text.27',\n",
-       "                'main-text.28',\n",
-       "                'main-text.29'\n",
-       "            ],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='(Redirected from IBM Almaden Research Center)\\nIBM employees have garnered six Nobel Prizes, six\n",
-       "Turing Awards, 20 inductees into the U.S. National Inventors Hall of Fame, 19 National Medals of Technology, five \n",
-       "National Medals of Science and three Kavli Prizes. $^{[2]}$ As of 2018, the company has generated more patents than\n",
-       "any other business in each of 25 consecutive years, which is a record. [3]',\n",
-       "            main_path='main-text.4',\n",
-       "            path_group=['main-text.2', 'main-text.4'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Laboratories\\nHistoric research centers for IBM also include IBM La Gaude (Nice), the Cambridge \n",
-       "Scientific Center, the IBM New York Scientific Center, 330 North Wabash (Chicago), IBM Austin Research Laboratory, \n",
-       "and IBM Laboratory Vienna. [33]',\n",
-       "            main_path='main-text.30',\n",
-       "            path_group=['main-text.16', 'main-text.30'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'IBM currently has 19 research facilities spread across 12 laboratories on six continents. The \u001b[0m\n", - "\u001b[32m12 laboratories are: Africa, Almaden, Australia, Brazil, Cambridge-IBM Research and MIT-IBM Watson AI Lab, China, \u001b[0m\n", - "\u001b[32mIndia, Israel, Ireland, Japan, Switzerland, and IBM Thomas J. Watson Research Center. Each laboratory typically has\u001b[0m\n", - "\u001b[32mseveral research facilities located in different geographical areas.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Laboratories\\nIBM currently has 19 research facilities spread across 12 laboratories\u001b[0m\n", - "\u001b[32mon six continents: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m32\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\nAfrica \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNairobi, Kenya, and Johannesburg, South Africa\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nAlmaden \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSan Jose\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nAustralia \u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mMelbourne\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nBrazil \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSao Paulo and Rio de Janeiro\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nCambridge-IBM Research and MIT-IBM Watson AI Lab \u001b[0m\u001b[32m(\u001b[0m\u001b[32mCambridge, \u001b[0m\n", - "\u001b[32mUS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nChina \u001b[0m\u001b[32m(\u001b[0m\u001b[32mBeijing\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIsrael \u001b[0m\u001b[32m(\u001b[0m\u001b[32mHaifa\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIreland \u001b[0m\u001b[32m(\u001b[0m\u001b[32mDublin\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIndia \u001b[0m\u001b[32m(\u001b[0m\u001b[32mDelhi and Bengaluru\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nJapan \u001b[0m\u001b[32m(\u001b[0m\u001b[32mTokyo and \u001b[0m\n", - "\u001b[32mShin-kawasaki\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nSwitzerland \u001b[0m\u001b[32m(\u001b[0m\u001b[32mZurich\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIBM Thomas J. Watson Research Center \u001b[0m\u001b[32m(\u001b[0m\u001b[32mYorktown Heights and Albany\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.17'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.16'\u001b[0m,\n", - " \u001b[32m'main-text.17'\u001b[0m,\n", - " \u001b[32m'main-text.18'\u001b[0m,\n", - " \u001b[32m'main-text.19'\u001b[0m,\n", - " \u001b[32m'main-text.20'\u001b[0m,\n", - " \u001b[32m'main-text.21'\u001b[0m,\n", - " \u001b[32m'main-text.22'\u001b[0m,\n", - " \u001b[32m'main-text.23'\u001b[0m,\n", - " \u001b[32m'main-text.24'\u001b[0m,\n", - " \u001b[32m'main-text.25'\u001b[0m,\n", - " \u001b[32m'main-text.26'\u001b[0m,\n", - " \u001b[32m'main-text.27'\u001b[0m,\n", - " \u001b[32m'main-text.28'\u001b[0m,\n", - " \u001b[32m'main-text.29'\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.11'\u001b[0m,\n", - " \u001b[32m'main-text.12'\u001b[0m,\n", - " \u001b[32m'main-text.13'\u001b[0m,\n", - " \u001b[32m'main-text.14'\u001b[0m,\n", - " \u001b[32m'main-text.15'\u001b[0m,\n", - " \u001b[32m'main-text.16'\u001b[0m,\n", - " \u001b[32m'main-text.17'\u001b[0m,\n", - " \u001b[32m'main-text.18'\u001b[0m,\n", - " \u001b[32m'main-text.19'\u001b[0m,\n", - " \u001b[32m'main-text.20'\u001b[0m,\n", - " \u001b[32m'main-text.21'\u001b[0m,\n", - " \u001b[32m'main-text.22'\u001b[0m,\n", - " \u001b[32m'main-text.23'\u001b[0m,\n", - " \u001b[32m'main-text.24'\u001b[0m,\n", - " \u001b[32m'main-text.25'\u001b[0m,\n", - " \u001b[32m'main-text.26'\u001b[0m,\n", - " \u001b[32m'main-text.27'\u001b[0m,\n", - " \u001b[32m'main-text.28'\u001b[0m,\n", - " \u001b[32m'main-text.29'\u001b[0m,\n", - " \u001b[32m'main-text.30'\u001b[0m,\n", - " \u001b[32m'main-text.31'\u001b[0m,\n", - " \u001b[32m'main-text.32'\u001b[0m,\n", - " \u001b[32m'main-text.33'\u001b[0m,\n", - " \u001b[32m'main-text.34'\u001b[0m,\n", - " \u001b[32m'main-text.35'\u001b[0m,\n", - " \u001b[32m'main-text.36'\u001b[0m,\n", - " \u001b[32m'main-text.37'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Laboratories\\nIBM currently has 19 research facilities spread across 12 laboratories on six \u001b[0m\n", - "\u001b[32mcontinents: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m32\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\nAfrica \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNairobi, Kenya, and Johannesburg, South Africa\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nAlmaden \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSan Jose\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nAustralia \u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mMelbourne\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nBrazil \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSao Paulo and Rio de Janeiro\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nCambridge-IBM Research and MIT-IBM Watson AI Lab \u001b[0m\u001b[32m(\u001b[0m\u001b[32mCambridge, \u001b[0m\n", - "\u001b[32mUS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nChina \u001b[0m\u001b[32m(\u001b[0m\u001b[32mBeijing\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIsrael \u001b[0m\u001b[32m(\u001b[0m\u001b[32mHaifa\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIreland \u001b[0m\u001b[32m(\u001b[0m\u001b[32mDublin\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIndia \u001b[0m\u001b[32m(\u001b[0m\u001b[32mDelhi and Bengaluru\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nJapan \u001b[0m\u001b[32m(\u001b[0m\u001b[32mTokyo and \u001b[0m\n", - "\u001b[32mShin-kawasaki\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nSwitzerland \u001b[0m\u001b[32m(\u001b[0m\u001b[32mZurich\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIBM Thomas J. Watson Research Center \u001b[0m\u001b[32m(\u001b[0m\u001b[32mYorktown Heights and Albany\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.17'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.16'\u001b[0m,\n", - " \u001b[32m'main-text.17'\u001b[0m,\n", - " \u001b[32m'main-text.18'\u001b[0m,\n", - " \u001b[32m'main-text.19'\u001b[0m,\n", - " \u001b[32m'main-text.20'\u001b[0m,\n", - " \u001b[32m'main-text.21'\u001b[0m,\n", - " \u001b[32m'main-text.22'\u001b[0m,\n", - " \u001b[32m'main-text.23'\u001b[0m,\n", - " \u001b[32m'main-text.24'\u001b[0m,\n", - " \u001b[32m'main-text.25'\u001b[0m,\n", - " \u001b[32m'main-text.26'\u001b[0m,\n", - " \u001b[32m'main-text.27'\u001b[0m,\n", - " \u001b[32m'main-text.28'\u001b[0m,\n", - " \u001b[32m'main-text.29'\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m(\u001b[0m\u001b[32mRedirected from IBM Almaden Research Center\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\nIBM employees have garnered six Nobel Prizes, six\u001b[0m\n", - "\u001b[32mTuring Awards, 20 inductees into the U.S. National Inventors Hall of Fame, 19 National Medals of Technology, five \u001b[0m\n", - "\u001b[32mNational Medals of Science and three Kavli Prizes. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ As of 2018, the company has generated more patents than\u001b[0m\n", - "\u001b[32many other business in each of 25 consecutive years, which is a record. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m3\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.4'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.2'\u001b[0m, \u001b[32m'main-text.4'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Laboratories\\nHistoric research centers for IBM also include IBM La Gaude \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNice\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, the Cambridge \u001b[0m\n", - "\u001b[32mScientific Center, the IBM New York Scientific Center, 330 North Wabash \u001b[0m\u001b[32m(\u001b[0m\u001b[32mChicago\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, IBM Austin Research Laboratory, \u001b[0m\n", - "\u001b[32mand IBM Laboratory Vienna. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m33\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.30'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.16'\u001b[0m, \u001b[32m'main-text.30'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"How many research labs does IBM have?\"\n", - "\n", - "# submit natural-language query on document\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - " # text_weight=TEXT_WEIGHT,\n", - " rerank=RERANK,\n", - " ## optional generation params\n", - " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", - " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", - " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", - " # gen_ctx_extr_method=\"window\",\n", - " # gen_ctx_window_size=5000,\n", - " # gen_ctx_window_lead_weight=0.5\n", - " # return_prompt=True,\n", - " # gen_timeout=10.0,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/6b70072911ad2794a3844dd44d1705a5ba37ca0b?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjI2YjcwMDcyOTExYWQyNzk0YTM4NDRkZDQ0ZDE3MDVhNWJhMzdjYTBiJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyYjMwYmM2NjdhMzI0YWUxMTFkMDI1NTI2NTYzYjY3NGE4ZDNmZDg2OWJjMDdjOGZkMjA0YWE5NWIwNWQ0MWYwYyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMmIzMGJjNjY3YTMyNGFlMTExZDAyNTUyNjU2M2I2NzRhOGQzZmQ4NjliYzA3YzhmZDIwNGFhOTViMDVkNDFmMGMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0ExNyU3RCU3RA%3D%3D)." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(\n", - " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Semantic retrieval\n", - "\n", - "Besides RAG, which includes natural language generation, a user may only be interested in\n", - "the semantic retrieval part.\n", - "\n", - "This can be obtained very similarly to RAG, as shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SearchResult(\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \n",
-       "Switzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \n",
-       "staffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \n",
-       "students and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\n",
-       "(formerly the Industry Solutions Lab), an executive briefing facility demonstrating technology prototypes and \n",
-       "solutions.',\n",
-       "            main_path='main-text.71',\n",
-       "            path_group=['main-text.69', 'main-text.71'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk='Switzerland\\nIBM Research-Zurich (previously called IBM Zurich Research Laboratory, ZRL) is the \n",
-       "European branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.',\n",
-       "            main_path='main-text.70',\n",
-       "            path_group=['main-text.69', 'main-text.70'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c',\n",
-       "            chunk=\"History\\nThe roots of today's IBM Research began with the 1945 opening of the Watson Scientific \n",
-       "Computing Laboratory at Columbia University. $^{[4]}$ This was the first IBM laboratory devoted to pure science and\n",
-       "later expanded into additional IBM Research locations in Westchester County, New York, starting in the 1950s, \n",
-       "$^{[5][6]}$ including the Thomas J. Watson Research Center in 1961. [5][6]\",\n",
-       "            main_path='main-text.8',\n",
-       "            path_group=['main-text.7', 'main-text.8'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSearchResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIn 1956, IBM opened their first European research laboratory in Adliswil, \u001b[0m\n", - "\u001b[32mSwitzerland, near Zurich. The lab moved to its own campus in neighboring Rüschlikon in 1962. The Zurich lab is \u001b[0m\n", - "\u001b[32mstaffed by a multicultural and interdisciplinary team of a few hundred permanent research staff members, graduate \u001b[0m\n", - "\u001b[32mstudents and post-doctoral fellows, representing about 45 nationalities. Collocated with the lab is a Client Center\u001b[0m\n", - "\u001b[32m(\u001b[0m\u001b[32mformerly the Industry Solutions Lab\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, an executive briefing facility demonstrating technology prototypes and \u001b[0m\n", - "\u001b[32msolutions.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.71'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.71'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Switzerland\\nIBM Research-Zurich \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpreviously called IBM Zurich Research Laboratory, ZRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is the \u001b[0m\n", - "\u001b[32mEuropean branch of IBM Research. It was opened in 1956 and is located in Rüschlikon, near Zurich, Switzerland.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.70'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.69'\u001b[0m, \u001b[32m'main-text.70'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'b30bc667a324ae111d025526563b674a8d3fd869bc07c8fd204aa95b05d41f0c'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"History\u001b[0m\u001b[32m\\nThe roots of today's IBM Research began with the 1945 opening of the Watson Scientific \u001b[0m\n", - "\u001b[32mComputing Laboratory at Columbia University. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ This was the first IBM laboratory devoted to pure science and\u001b[0m\n", - "\u001b[32mlater expanded into additional IBM Research locations in Westchester County, New York, starting in the 1950s, \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ including the Thomas J. Watson Research Center in 1961. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m5\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.8'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.7'\u001b[0m, \u001b[32m'main-text.8'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Where was the first European IBM lab located?\"\n", - "\n", - "# submit natural-language query on document\n", - "question_query = SemanticQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional params\n", - " retr_k=RETR_K,\n", - " # text_weight=TEXT_WEIGHT,\n", - " # rerank=RERANK,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = SearchResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RAG on document not in semantically indexed collection" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare source" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "coords = ElasticProjectDataCollectionSource(\n", - " proj_key=PROJ_KEY,\n", - " index_key=SEM_OFF_IDX_KEY,\n", - ")\n", - "data_source = PrivateDataDocumentSource(\n", - " source=coords,\n", - " document_hash=SEM_OFF_IDX_DOC_HASH,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ingestion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the cell below we show how to semantically index a single document:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pva/work/github.com/DS4SD/deepsearch-examples/.venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", - " Expected `list[str]` but got `_LiteralGenericAlias` - serialized value may not be as expected\n", - " return self.__pydantic_serializer__.to_python(\n" - ] - }, - { - "data": { - "text/plain": [ - "{'ing_out': {}}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# launch the ingestion of the document for DocumentQA\n", - "task = api.documents.semantic_ingest(\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " skip_ingested_docs=False, # forcing re-indexing for the purpose of this example\n", - ")\n", - "\n", - "# wait for the ingestion task to finish\n", - "api.tasks.wait_for(PROJ_KEY, task.task_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once the document has been semantically ingested, we can run both RAG and semantic retrieval queries against it, as shown below." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### RAG" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-       "    answers=[\n",
-       "        RAGAnswerItem(\n",
-       "            answer='Magnavox introduced the Odyssey, the first video game console. $^{[10]}$',\n",
-       "            grounding=RAGGroundingInfo(\n",
-       "                retr_items=[\n",
-       "                    SearchResultItem(\n",
-       "                        doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "                        chunk='Magnavox\\nIn 1972 Magnavox introduced the Odyssey, the first video game console. \n",
-       "$^{[10]}$ In 1975, the Magnavox Company was acquired by Philips of the Netherlands to ensure nationwide \n",
-       "distribution for their VLP (later renamed LaserVision) Videodisc technology, and all Philips consumer electronics \n",
-       "in the US under the Norelco name began rebranding them under the Magnavox name; $^{[11]}$ Philips acquired the \n",
-       "similar-sounding company Philco in 1981, and Philips was able to freely use the Philips name, alternating with the \n",
-       "Magnavox name for some electronics, with the personal care business continuing to use the Norelco name.',\n",
-       "                        main_path='main-text.9',\n",
-       "                        path_group=['main-text.5', 'main-text.9'],\n",
-       "                        source_is_text=True\n",
-       "                    )\n",
-       "                ],\n",
-       "                gen_ctx_paths=[\n",
-       "                    'main-text.3',\n",
-       "                    'main-text.4',\n",
-       "                    'main-text.5',\n",
-       "                    'main-text.6',\n",
-       "                    'main-text.7',\n",
-       "                    'main-text.9',\n",
-       "                    'main-text.10',\n",
-       "                    'main-text.11',\n",
-       "                    'main-text.12',\n",
-       "                    'main-text.13',\n",
-       "                    'main-text.14',\n",
-       "                    'main-text.15',\n",
-       "                    'main-text.16',\n",
-       "                    'main-text.17'\n",
-       "                ]\n",
-       "            ),\n",
-       "            prompt=None\n",
-       "        )\n",
-       "    ],\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk='Magnavox\\nIn 1972 Magnavox introduced the Odyssey, the first video game console. $^{[10]}$ In \n",
-       "1975, the Magnavox Company was acquired by Philips of the Netherlands to ensure nationwide distribution for their \n",
-       "VLP (later renamed LaserVision) Videodisc technology, and all Philips consumer electronics in the US under the \n",
-       "Norelco name began rebranding them under the Magnavox name; $^{[11]}$ Philips acquired the similar-sounding company\n",
-       "Philco in 1981, and Philips was able to freely use the Philips name, alternating with the Magnavox name for some \n",
-       "electronics, with the personal care business continuing to use the Norelco name.',\n",
-       "            main_path='main-text.9',\n",
-       "            path_group=['main-text.5', 'main-text.9'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk=\"Magnavox\\nMagnavox (Latin for 'great voice ', stylized as MAGNAVOX) is an American electronics \n",
-       "company that since 1975 has been a subsidiary of the Dutch electronics corporation Philips. [1] The predecessor to \n",
-       "Magnavox was founded in 1911 by Edwin Pridham and Peter L. Jensen, coinventors of the moving-coil loudspeaker at \n",
-       "their lab in Napa, California, under United States Patent number 1,105,924 for telephone receivers. $^{[2]}$ Six \n",
-       "decades later, Magnavox produced the Odyssey, the world's first home video game console.\",\n",
-       "            main_path='main-text.1',\n",
-       "            path_group=['main-text.0', 'main-text.1'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk='Magnavox\\nDuring the late 1970s the company released the Odyssey², in Europe also known as \n",
-       "Philips Videopac G7000.',\n",
-       "            main_path='main-text.12',\n",
-       "            path_group=['main-text.5', 'main-text.12'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk=\"History\\nJensen and Pridham founded the Commercial Wireless and Development Company in Napa, CA \n",
-       "in 1911, moving to San Francisco, and then Oakland in 1916. In July 1917, a merger with The Sonora Phonograph \n",
-       "Distributor Company was finalized and the Magnavox Company was born. Frank Morgan Steers was chosen as the \n",
-       "company's first President. Jensen moved on to found the Jensen Radio Manufacturing Company in Chicago, in the late \n",
-       "1920s. Pridham stayed on with Magnavox, which moved manufacturing to Fort Wayne, Indiana by the 1930s. The term \n",
-       "'Commercial Wireless ' had a different meaning in the early days of radio and telephone. Magnavox manufactured \n",
-       "radios, TVs, and phonographs. In the 1960s, Magnavox manufactured the first plasma displays for the military and \n",
-       "for computer applications.\",\n",
-       "            main_path='main-text.4',\n",
-       "            path_group=['main-text.3', 'main-text.4'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'Magnavox introduced the Odyssey, the first video game console. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Magnavox\\nIn 1972 Magnavox introduced the Odyssey, the first video game console. \u001b[0m\n", - "\u001b[32m$^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ In 1975, the Magnavox Company was acquired by Philips of the Netherlands to ensure nationwide \u001b[0m\n", - "\u001b[32mdistribution for their VLP \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlater renamed LaserVision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m Videodisc technology, and all Philips consumer electronics \u001b[0m\n", - "\u001b[32min the US under the Norelco name began rebranding them under the Magnavox name; $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Philips acquired the \u001b[0m\n", - "\u001b[32msimilar-sounding company Philco in 1981, and Philips was able to freely use the Philips name, alternating with the \u001b[0m\n", - "\u001b[32mMagnavox name for some electronics, with the personal care business continuing to use the Norelco name.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.9'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.9'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.3'\u001b[0m,\n", - " \u001b[32m'main-text.4'\u001b[0m,\n", - " \u001b[32m'main-text.5'\u001b[0m,\n", - " \u001b[32m'main-text.6'\u001b[0m,\n", - " \u001b[32m'main-text.7'\u001b[0m,\n", - " \u001b[32m'main-text.9'\u001b[0m,\n", - " \u001b[32m'main-text.10'\u001b[0m,\n", - " \u001b[32m'main-text.11'\u001b[0m,\n", - " \u001b[32m'main-text.12'\u001b[0m,\n", - " \u001b[32m'main-text.13'\u001b[0m,\n", - " \u001b[32m'main-text.14'\u001b[0m,\n", - " \u001b[32m'main-text.15'\u001b[0m,\n", - " \u001b[32m'main-text.16'\u001b[0m,\n", - " \u001b[32m'main-text.17'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Magnavox\\nIn 1972 Magnavox introduced the Odyssey, the first video game console. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ In \u001b[0m\n", - "\u001b[32m1975, the Magnavox Company was acquired by Philips of the Netherlands to ensure nationwide distribution for their \u001b[0m\n", - "\u001b[32mVLP \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlater renamed LaserVision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m Videodisc technology, and all Philips consumer electronics in the US under the \u001b[0m\n", - "\u001b[32mNorelco name began rebranding them under the Magnavox name; $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Philips acquired the similar-sounding company\u001b[0m\n", - "\u001b[32mPhilco in 1981, and Philips was able to freely use the Philips name, alternating with the Magnavox name for some \u001b[0m\n", - "\u001b[32melectronics, with the personal care business continuing to use the Norelco name.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.9'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.9'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"Magnavox\u001b[0m\u001b[32m\\nMagnavox \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLatin for 'great voice ', stylized as MAGNAVOX\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is an American electronics \u001b[0m\n", - "\u001b[32mcompany that since 1975 has been a subsidiary of the Dutch electronics corporation Philips. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The predecessor to \u001b[0m\n", - "\u001b[32mMagnavox was founded in 1911 by Edwin Pridham and Peter L. Jensen, coinventors of the moving-coil loudspeaker at \u001b[0m\n", - "\u001b[32mtheir lab in Napa, California, under United States Patent number 1,105,924 for telephone receivers. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Six \u001b[0m\n", - "\u001b[32mdecades later, Magnavox produced the Odyssey, the world's first home video game console.\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.1'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.0'\u001b[0m, \u001b[32m'main-text.1'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Magnavox\\nDuring the late 1970s the company released the Odyssey², in Europe also known as \u001b[0m\n", - "\u001b[32mPhilips Videopac G7000.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.12'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.12'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"History\u001b[0m\u001b[32m\\nJensen and Pridham founded the Commercial Wireless and Development Company in Napa, CA \u001b[0m\n", - "\u001b[32min 1911, moving to San Francisco, and then Oakland in 1916. In July 1917, a merger with The Sonora Phonograph \u001b[0m\n", - "\u001b[32mDistributor Company was finalized and the Magnavox Company was born. Frank Morgan Steers was chosen as the \u001b[0m\n", - "\u001b[32mcompany's first President. Jensen moved on to found the Jensen Radio Manufacturing Company in Chicago, in the late \u001b[0m\n", - "\u001b[32m1920s. Pridham stayed on with Magnavox, which moved manufacturing to Fort Wayne, Indiana by the 1930s. The term \u001b[0m\n", - "\u001b[32m'Commercial Wireless ' had a different meaning in the early days of radio and telephone. Magnavox manufactured \u001b[0m\n", - "\u001b[32mradios, TVs, and phonographs. In the 1960s, Magnavox manufactured the first plasma displays for the military and \u001b[0m\n", - "\u001b[32mfor computer applications.\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.4'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.3'\u001b[0m, \u001b[32m'main-text.4'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Which company created the first game console?\"\n", - "\n", - "# submit natural-language query on document\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional retrieval params\n", - " retr_k=4,\n", - " # text_weight=TEXT_WEIGHT,\n", - " rerank=RERANK,\n", - " ## optional generation params\n", - " # model_id=\"ibm-mistralai/mixtral-8x7b-instruct-v01-q\",\n", - " # gen_params={\"random_seed\": 42, \"max_new_tokens\": 1024},\n", - " # prompt_template=\"Answer the query based on the context.\\n\\nContext: {{ context }}\\n\\nQuery: {{ query }}\",\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "The provenance of the answer can be inspected on the [source document](https://sds.app.accelerate.science/projects/b09ae7561a01dc7c4b0fd21a43bfd93d140766d1/library/private/b4edbe66a8b8fe2ebed7e20d4d7b9335c48b45b0?search=JTdCJTIycHJpdmF0ZUNvbGxlY3Rpb24lMjIlM0ElMjJiNGVkYmU2NmE4YjhmZTJlYmVkN2UyMGQ0ZDdiOTMzNWM0OGI0NWIwJTIyJTJDJTIydHlwZSUyMiUzQSUyMkRvY3VtZW50JTIyJTJDJTIyZXhwcmVzc2lvbiUyMiUzQSUyMmZpbGUtaW5mby5kb2N1bWVudC1oYXNoJTNBJTIwJTVDJTIyMDI5MjEwZGY5MjljNzhlNzBkNzRlNmYxNDFhNDZkODMyNjkwNWNlNTg1NjJmMjA4MTgxOWM4MGMzOTIxZDVhMyU1QyUyMiUyMiUyQyUyMmZpbHRlcnMlMjIlM0ElNUIlNUQlMkMlMjJzZWxlY3QlMjIlM0ElNUIlMjJfbmFtZSUyMiUyQyUyMmRlc2NyaXB0aW9uLmNvbGxlY3Rpb24lMjIlMkMlMjJwcm92JTIyJTJDJTIyZGVzY3JpcHRpb24udGl0bGUlMjIlMkMlMjJkZXNjcmlwdGlvbi5wdWJsaWNhdGlvbl9kYXRlJTIyJTJDJTIyZGVzY3JpcHRpb24udXJsX3JlZnMlMjIlNUQlMkMlMjJpdGVtSW5kZXglMjIlM0EwJTJDJTIycGFnZVNpemUlMjIlM0ExMCUyQyUyMnNlYXJjaEFmdGVySGlzdG9yeSUyMiUzQSU1QiU1RCUyQyUyMnZpZXdUeXBlJTIyJTNBJTIyc25pcHBldHMlMjIlMkMlMjJyZWNvcmRTZWxlY3Rpb24lMjIlM0ElN0IlMjJyZWNvcmQlMjIlM0ElN0IlMjJpZCUyMiUzQSUyMjAyOTIxMGRmOTI5Yzc4ZTcwZDc0ZTZmMTQxYTQ2ZDgzMjY5MDVjZTU4NTYyZjIwODE4MTljODBjMzkyMWQ1YTMlMjIlN0QlMkMlMjJpdGVtSW5kZXglMjIlM0E5JTdEJTdE)." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "render_provenance_url(\n", - " api=api, coords=coords, retr_item=rag_result.answers[0].grounding.retr_items[0]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Semantic retrieval" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SearchResult(\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk='Magnavox\\nIn 1972 Magnavox introduced the Odyssey, the first video game console. $^{[10]}$ In \n",
-       "1975, the Magnavox Company was acquired by Philips of the Netherlands to ensure nationwide distribution for their \n",
-       "VLP (later renamed LaserVision) Videodisc technology, and all Philips consumer electronics in the US under the \n",
-       "Norelco name began rebranding them under the Magnavox name; $^{[11]}$ Philips acquired the similar-sounding company\n",
-       "Philco in 1981, and Philips was able to freely use the Philips name, alternating with the Magnavox name for some \n",
-       "electronics, with the personal care business continuing to use the Norelco name.',\n",
-       "            main_path='main-text.9',\n",
-       "            path_group=['main-text.5', 'main-text.9'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk=\"Magnavox\\nMagnavox (Latin for 'great voice ', stylized as MAGNAVOX) is an American electronics \n",
-       "company that since 1975 has been a subsidiary of the Dutch electronics corporation Philips. [1] The predecessor to \n",
-       "Magnavox was founded in 1911 by Edwin Pridham and Peter L. Jensen, coinventors of the moving-coil loudspeaker at \n",
-       "their lab in Napa, California, under United States Patent number 1,105,924 for telephone receivers. $^{[2]}$ Six \n",
-       "decades later, Magnavox produced the Odyssey, the world's first home video game console.\",\n",
-       "            main_path='main-text.1',\n",
-       "            path_group=['main-text.0', 'main-text.1'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk='Magnavox\\nDuring the late 1970s the company released the Odyssey², in Europe also known as \n",
-       "Philips Videopac G7000.',\n",
-       "            main_path='main-text.12',\n",
-       "            path_group=['main-text.5', 'main-text.12'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3',\n",
-       "            chunk=\"History\\nJensen and Pridham founded the Commercial Wireless and Development Company in Napa, CA \n",
-       "in 1911, moving to San Francisco, and then Oakland in 1916. In July 1917, a merger with The Sonora Phonograph \n",
-       "Distributor Company was finalized and the Magnavox Company was born. Frank Morgan Steers was chosen as the \n",
-       "company's first President. Jensen moved on to found the Jensen Radio Manufacturing Company in Chicago, in the late \n",
-       "1920s. Pridham stayed on with Magnavox, which moved manufacturing to Fort Wayne, Indiana by the 1930s. The term \n",
-       "'Commercial Wireless ' had a different meaning in the early days of radio and telephone. Magnavox manufactured \n",
-       "radios, TVs, and phonographs. In the 1960s, Magnavox manufactured the first plasma displays for the military and \n",
-       "for computer applications.\",\n",
-       "            main_path='main-text.4',\n",
-       "            path_group=['main-text.3', 'main-text.4'],\n",
-       "            source_is_text=True\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSearchResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Magnavox\\nIn 1972 Magnavox introduced the Odyssey, the first video game console. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ In \u001b[0m\n", - "\u001b[32m1975, the Magnavox Company was acquired by Philips of the Netherlands to ensure nationwide distribution for their \u001b[0m\n", - "\u001b[32mVLP \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlater renamed LaserVision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m Videodisc technology, and all Philips consumer electronics in the US under the \u001b[0m\n", - "\u001b[32mNorelco name began rebranding them under the Magnavox name; $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Philips acquired the similar-sounding company\u001b[0m\n", - "\u001b[32mPhilco in 1981, and Philips was able to freely use the Philips name, alternating with the Magnavox name for some \u001b[0m\n", - "\u001b[32melectronics, with the personal care business continuing to use the Norelco name.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.9'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.9'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"Magnavox\u001b[0m\u001b[32m\\nMagnavox \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLatin for 'great voice ', stylized as MAGNAVOX\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is an American electronics \u001b[0m\n", - "\u001b[32mcompany that since 1975 has been a subsidiary of the Dutch electronics corporation Philips. \u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The predecessor to \u001b[0m\n", - "\u001b[32mMagnavox was founded in 1911 by Edwin Pridham and Peter L. Jensen, coinventors of the moving-coil loudspeaker at \u001b[0m\n", - "\u001b[32mtheir lab in Napa, California, under United States Patent number 1,105,924 for telephone receivers. $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$ Six \u001b[0m\n", - "\u001b[32mdecades later, Magnavox produced the Odyssey, the world's first home video game console.\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.1'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.0'\u001b[0m, \u001b[32m'main-text.1'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'Magnavox\\nDuring the late 1970s the company released the Odyssey², in Europe also known as \u001b[0m\n", - "\u001b[32mPhilips Videopac G7000.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.12'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.5'\u001b[0m, \u001b[32m'main-text.12'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'029210df929c78e70d74e6f141a46d8326905ce58562f2081819c80c3921d5a3'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"History\u001b[0m\u001b[32m\\nJensen and Pridham founded the Commercial Wireless and Development Company in Napa, CA \u001b[0m\n", - "\u001b[32min 1911, moving to San Francisco, and then Oakland in 1916. In July 1917, a merger with The Sonora Phonograph \u001b[0m\n", - "\u001b[32mDistributor Company was finalized and the Magnavox Company was born. Frank Morgan Steers was chosen as the \u001b[0m\n", - "\u001b[32mcompany's first President. Jensen moved on to found the Jensen Radio Manufacturing Company in Chicago, in the late \u001b[0m\n", - "\u001b[32m1920s. Pridham stayed on with Magnavox, which moved manufacturing to Fort Wayne, Indiana by the 1930s. The term \u001b[0m\n", - "\u001b[32m'Commercial Wireless ' had a different meaning in the early days of radio and telephone. Magnavox manufactured \u001b[0m\n", - "\u001b[32mradios, TVs, and phonographs. In the 1960s, Magnavox manufactured the first plasma displays for the military and \u001b[0m\n", - "\u001b[32mfor computer applications.\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.4'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.3'\u001b[0m, \u001b[32m'main-text.4'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"Which company created the first game console?\"\n", - "\n", - "# submit natural-language query on document\n", - "question_query = SemanticQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional params\n", - " retr_k=4,\n", - " # text_weight=TEXT_WEIGHT,\n", - " rerank=RERANK,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = SearchResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## QA on document from public collection" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare source" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", - "from deepsearch.cps.client.components.documents import PublicDataDocumentSource\n", - "\n", - "index_key = \"acl\"\n", - "document_hash = \"0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb\"\n", - "\n", - "coords = ElasticDataCollectionSource(\n", - " elastic_id=\"default\",\n", - " index_key=index_key,\n", - ")\n", - "data_source = PublicDataDocumentSource(\n", - " source=coords,\n", - " document_hash=document_hash,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "9e959bf3", - "metadata": {}, - "source": [ - "### Ingestion" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "ec5a6a3b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ing_out': {}}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# launch the ingestion of the document for DocumentQA\n", - "task = api.documents.semantic_ingest(\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " skip_ingested_docs=False, # forcing re-indexing for the purpose of this example\n", - ")\n", - "\n", - "# wait for the ingestion task to finish\n", - "api.tasks.wait_for(PROJ_KEY, task.task_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### RAG" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RAGResult(\n",
-       "    answers=[\n",
-       "        RAGAnswerItem(\n",
-       "            answer='A player can achieve 28 game goals in the MP game.',\n",
-       "            grounding=RAGGroundingInfo(\n",
-       "                retr_items=[\n",
-       "                    SearchResultItem(\n",
-       "                        doc_hash='0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb',\n",
-       "                        chunk=\"2.1 The MP Game and Dialogs\\nIn total, the player can achieve up to 28 game goals by\n",
-       "conducting 12 separate dialogs in various parts of the virtual world. Each of the 12 dialogs in the MP game helps \n",
-       "players to achieve the game goals. The player interacts with the virtual characters to obtain information that \n",
-       "helps her to achieve these goals and, as a consequence, to increase her score in the game. Table 1 summarises the \n",
-       "game goals and the contextual parameters (player's role, location in the virtual world, VCs present) associated \n",
-       "with each dialog.\",\n",
-       "                        main_path='main-text.24',\n",
-       "                        path_group=['main-text.22', 'main-text.24'],\n",
-       "                        source_is_text=True\n",
-       "                    )\n",
-       "                ],\n",
-       "                gen_ctx_paths=[\n",
-       "                    'main-text.16',\n",
-       "                    'main-text.17',\n",
-       "                    'main-text.19',\n",
-       "                    'main-text.20',\n",
-       "                    'main-text.21',\n",
-       "                    'main-text.22',\n",
-       "                    'main-text.23',\n",
-       "                    'main-text.24',\n",
-       "                    'main-text.25',\n",
-       "                    'main-text.26',\n",
-       "                    'main-text.28',\n",
-       "                    'main-text.29',\n",
-       "                    'main-text.30',\n",
-       "                    'main-text.31',\n",
-       "                    'main-text.32',\n",
-       "                    'main-text.33',\n",
-       "                    'main-text.34',\n",
-       "                    'main-text.35'\n",
-       "                ]\n",
-       "            ),\n",
-       "            prompt=None\n",
-       "        )\n",
-       "    ],\n",
-       "    search_result_items=[\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb',\n",
-       "            chunk=\"2.1 The MP Game and Dialogs\\nIn total, the player can achieve up to 28 game goals by conducting \n",
-       "12 separate dialogs in various parts of the virtual world. Each of the 12 dialogs in the MP game helps players to \n",
-       "achieve the game goals. The player interacts with the virtual characters to obtain information that helps her to \n",
-       "achieve these goals and, as a consequence, to increase her score in the game. Table 1 summarises the game goals and\n",
-       "the contextual parameters (player's role, location in the virtual world, VCs present) associated with each \n",
-       "dialog.\",\n",
-       "            main_path='main-text.24',\n",
-       "            path_group=['main-text.22', 'main-text.24'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb',\n",
-       "            chunk='2.1 The MP Game and Dialogs\\nThe MP game is a multi-player quest where 3 teenagers seek to build\n",
-       "a joystick in order to free their uncle trapped in a video game $^{1}$. To build this joystick, the player (who \n",
-       "alternatively represents anyone of these three teenagers) must explore the plastic factory and achieve 17 mandatory\n",
-       "goals (find the plans, get the appropriate mould, retrieve some plastic from the storing shed, etc), as well as 11 \n",
-       "optional goals which, when reached, provide them with extra information about the plastic industry (and therefore \n",
-       "increases their knowledge of it).',\n",
-       "            main_path='main-text.23',\n",
-       "            path_group=['main-text.22', 'main-text.23'],\n",
-       "            source_is_text=True\n",
-       "        ),\n",
-       "        SearchResultItem(\n",
-       "            doc_hash='0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb',\n",
-       "            chunk='3 Dialogue Systems\\n1, VC = Lucas. 1, Player = Ben. 1, Goals = Find the address of the \n",
-       "enterprise.. 1, Location = Uncle’s place.. 2, VC = M.Jasper. 2, Player = Lucas. 2, Goals = The manufacturing first \n",
-       "step. 2, Location = Enterprise reception. 3, VC = Samir. 3, Player = Julie. 3, Goals = Find the plans of the \n",
-       "joystick Optional: job, staff, studies, security policies. 3, Location = Designing Office. 4, VC = Samir. 4, Player\n",
-       "= Julie. 4, Goals = Find out what to do next Optional: jobs in the enterprise, staff in the enterprise. 4, Location\n",
-       "= Designing Office. 5, VC = Melissa. 5, Player = Lucas. 5, Goals = Find the mould, optional where are the moulds. \n",
-       "5, Location = Plant. 6, VC = Melissa. 6, Player = Lucas. 6, Goals = Find the right machine. 6, Location = Plant. 7,\n",
-       "VC = Melissa. 7, Player = Lucas. 7, Goals = Confirm you have found the right mould and machine and find out what to\n",
-       "do next. 7, Location = Plant. 8, VC = Operator. 8, Player = Julie. 8, Goals = Knowing about the material space and \n",
-       "about the job Optional: find out what to do in the case of failure helping to feed a machine with the right \n",
-       "material. 8, Location = Material Space. 9, VC = Serge. 9, Player = Ben. 9, Goals = Perform quality tests. Optional:\n",
-       "VC’s job. 9, Location = Laboratory Tests. 10, VC = Serge. 10, Player = Ben. 10, Goals = Find out what to do next. \n",
-       "Optional: know what happens with broken items. 10, Location = Laboratory Tests. 11, VC = Sophia. 11, Player = \n",
-       "Julie. 11, Goals = Find the electronic components, knowing about VC’s job. 11, Location = Finishing. 12, VC = \n",
-       "Sophia. 12, Player = Lucas. 12, Goals = Finishing process Optional: know about conditioning the product. 12, \n",
-       "Location = Finishing',\n",
-       "            main_path='main-text.38',\n",
-       "            path_group=['main-text.35', 'main-text.38'],\n",
-       "            source_is_text=False\n",
-       "        )\n",
-       "    ]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mRAGResult\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswers\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mRAGAnswerItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33manswer\u001b[0m=\u001b[32m'A player can achieve 28 game goals in the MP game.'\u001b[0m,\n", - " \u001b[33mgrounding\u001b[0m=\u001b[1;35mRAGGroundingInfo\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mretr_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"2\u001b[0m\u001b[32m.1 The MP Game and Dialogs\\nIn total, the player can achieve up to 28 game goals by\u001b[0m\n", - "\u001b[32mconducting 12 separate dialogs in various parts of the virtual world. Each of the 12 dialogs in the MP game helps \u001b[0m\n", - "\u001b[32mplayers to achieve the game goals. The player interacts with the virtual characters to obtain information that \u001b[0m\n", - "\u001b[32mhelps her to achieve these goals and, as a consequence, to increase her score in the game. Table 1 summarises the \u001b[0m\n", - "\u001b[32mgame goals and the contextual parameters \u001b[0m\u001b[32m(\u001b[0m\u001b[32mplayer's role, location in the virtual world, VCs present\u001b[0m\u001b[32m)\u001b[0m\u001b[32m associated \u001b[0m\n", - "\u001b[32mwith each dialog.\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.24'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.22'\u001b[0m, \u001b[32m'main-text.24'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33mgen_ctx_paths\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[32m'main-text.16'\u001b[0m,\n", - " \u001b[32m'main-text.17'\u001b[0m,\n", - " \u001b[32m'main-text.19'\u001b[0m,\n", - " \u001b[32m'main-text.20'\u001b[0m,\n", - " \u001b[32m'main-text.21'\u001b[0m,\n", - " \u001b[32m'main-text.22'\u001b[0m,\n", - " \u001b[32m'main-text.23'\u001b[0m,\n", - " \u001b[32m'main-text.24'\u001b[0m,\n", - " \u001b[32m'main-text.25'\u001b[0m,\n", - " \u001b[32m'main-text.26'\u001b[0m,\n", - " \u001b[32m'main-text.28'\u001b[0m,\n", - " \u001b[32m'main-text.29'\u001b[0m,\n", - " \u001b[32m'main-text.30'\u001b[0m,\n", - " \u001b[32m'main-text.31'\u001b[0m,\n", - " \u001b[32m'main-text.32'\u001b[0m,\n", - " \u001b[32m'main-text.33'\u001b[0m,\n", - " \u001b[32m'main-text.34'\u001b[0m,\n", - " \u001b[32m'main-text.35'\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mprompt\u001b[0m=\u001b[3;35mNone\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[33msearch_result_items\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m\"2\u001b[0m\u001b[32m.1 The MP Game and Dialogs\\nIn total, the player can achieve up to 28 game goals by conducting \u001b[0m\n", - "\u001b[32m12 separate dialogs in various parts of the virtual world. Each of the 12 dialogs in the MP game helps players to \u001b[0m\n", - "\u001b[32machieve the game goals. The player interacts with the virtual characters to obtain information that helps her to \u001b[0m\n", - "\u001b[32machieve these goals and, as a consequence, to increase her score in the game. Table 1 summarises the game goals and\u001b[0m\n", - "\u001b[32mthe contextual parameters \u001b[0m\u001b[32m(\u001b[0m\u001b[32mplayer's role, location in the virtual world, VCs present\u001b[0m\u001b[32m)\u001b[0m\u001b[32m associated with each \u001b[0m\n", - "\u001b[32mdialog.\"\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.24'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.22'\u001b[0m, \u001b[32m'main-text.24'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'2.1 The MP Game and Dialogs\\nThe MP game is a multi-player quest where 3 teenagers seek to build\u001b[0m\n", - "\u001b[32ma joystick in order to free their uncle trapped in a video game $^\u001b[0m\u001b[32m{\u001b[0m\u001b[32m1\u001b[0m\u001b[32m}\u001b[0m\u001b[32m$. To build this joystick, the player \u001b[0m\u001b[32m(\u001b[0m\u001b[32mwho \u001b[0m\n", - "\u001b[32malternatively represents anyone of these three teenagers\u001b[0m\u001b[32m)\u001b[0m\u001b[32m must explore the plastic factory and achieve 17 mandatory\u001b[0m\n", - "\u001b[32mgoals \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfind the plans, get the appropriate mould, retrieve some plastic from the storing shed, etc\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, as well as 11 \u001b[0m\n", - "\u001b[32moptional goals which, when reached, provide them with extra information about the plastic industry \u001b[0m\u001b[32m(\u001b[0m\u001b[32mand therefore \u001b[0m\n", - "\u001b[32mincreases their knowledge of it\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.23'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.22'\u001b[0m, \u001b[32m'main-text.23'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSearchResultItem\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mdoc_hash\u001b[0m=\u001b[32m'0002e4fc1cef1c98b411c75e484db0a3d32f6bc1b4058e2985e5f377721761fb'\u001b[0m,\n", - " \u001b[33mchunk\u001b[0m=\u001b[32m'3 Dialogue Systems\\n1, VC = Lucas. 1, Player = Ben. 1, Goals = Find the address of the \u001b[0m\n", - "\u001b[32menterprise.. 1, Location = Uncle’s place.. 2, VC = M.Jasper. 2, Player = Lucas. 2, Goals = The manufacturing first \u001b[0m\n", - "\u001b[32mstep. 2, Location = Enterprise reception. 3, VC = Samir. 3, Player = Julie. 3, Goals = Find the plans of the \u001b[0m\n", - "\u001b[32mjoystick Optional: job, staff, studies, security policies. 3, Location = Designing Office. 4, VC = Samir. 4, Player\u001b[0m\n", - "\u001b[32m= Julie. 4, Goals = Find out what to do next Optional: jobs in the enterprise, staff in the enterprise. 4, Location\u001b[0m\n", - "\u001b[32m= Designing Office. 5, VC = Melissa. 5, Player = Lucas. 5, Goals = Find the mould, optional where are the moulds. \u001b[0m\n", - "\u001b[32m5, Location = Plant. 6, VC = Melissa. 6, Player = Lucas. 6, Goals = Find the right machine. 6, Location = Plant. 7,\u001b[0m\n", - "\u001b[32mVC = Melissa. 7, Player = Lucas. 7, Goals = Confirm you have found the right mould and machine and find out what to\u001b[0m\n", - "\u001b[32mdo next. 7, Location = Plant. 8, VC = Operator. 8, Player = Julie. 8, Goals = Knowing about the material space and \u001b[0m\n", - "\u001b[32mabout the job Optional: find out what to do in the case of failure helping to feed a machine with the right \u001b[0m\n", - "\u001b[32mmaterial. 8, Location = Material Space. 9, VC = Serge. 9, Player = Ben. 9, Goals = Perform quality tests. Optional:\u001b[0m\n", - "\u001b[32mVC’s job. 9, Location = Laboratory Tests. 10, VC = Serge. 10, Player = Ben. 10, Goals = Find out what to do next. \u001b[0m\n", - "\u001b[32mOptional: know what happens with broken items. 10, Location = Laboratory Tests. 11, VC = Sophia. 11, Player = \u001b[0m\n", - "\u001b[32mJulie. 11, Goals = Find the electronic components, knowing about VC’s job. 11, Location = Finishing. 12, VC = \u001b[0m\n", - "\u001b[32mSophia. 12, Player = Lucas. 12, Goals = Finishing process Optional: know about conditioning the product. 12, \u001b[0m\n", - "\u001b[32mLocation = Finishing'\u001b[0m,\n", - " \u001b[33mmain_path\u001b[0m=\u001b[32m'main-text.38'\u001b[0m,\n", - " \u001b[33mpath_group\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'main-text.35'\u001b[0m, \u001b[32m'main-text.38'\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33msource_is_text\u001b[0m=\u001b[3;91mFalse\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "question = \"How many goals can a player achieve in the MP game?\"\n", - "\n", - "# submit natural-language query on document\n", - "question_query = RAGQuery(\n", - " question=question,\n", - " project=PROJ_KEY,\n", - " data_source=data_source,\n", - " ## optional retrieval params\n", - " retr_k=RETR_K,\n", - ")\n", - "api_output = api.queries.run(question_query)\n", - "rag_result = RAGResult.from_api_output(api_output, raise_on_error=RAISE)\n", - "\n", - "rich.print(rag_result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}