From 4d456e27f9e64172a70410efc283f73a02837546 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 4 Jan 2024 12:16:30 +0100 Subject: [PATCH] ML/LLM: RAG example copy edits --- .../cratedb_rag_customer_support.ipynb | 1281 +++++------------ .../llm-langchain/pyproject.toml | 3 + 2 files changed, 369 insertions(+), 915 deletions(-) diff --git a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb index ca9f167f..fd1ef21e 100644 --- a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb +++ b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb @@ -1,924 +1,375 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "This notebook shows how to use the CrateDB vector store functionality around FLOAT_VECTOR and KNN_MATCH. You will learn how to use it to create a retrieval augmented generation (RAG) pipeline." + ], + "metadata": { + "id": "rUPQQ-jlMkUd" + } + }, + { + "cell_type": "markdown", + "source": [ + "## What is CrateDB?\n", + "\n", + "CrateDB is an open-source, distributed, and scalable SQL analytics database for storing and analyzing massive amounts of data in near real-time, even with complex queries. It is wire-compatible to PostgreSQL, based on Lucene, and inherits the shared-nothing distribution layer of Elasticsearch.\n", + "\n", + "This example uses the Python client driver for CrateDB." + ], + "metadata": { + "id": "Pe-5yxFDMl0S" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Getting Started\n", + "CrateDB supports storing vectors since version 5.5. You can leverage the fully managed service of CrateDB Cloud, or install CrateDB on your own, for example using Docker.\n", + "\n", + "```shell\n", + "docker run --publish 4200:4200 --publish 5432:5432 --pull=always crate:latest -Cdiscovery.type=single-node\n", + "```\n", + "\n", + "## Setup\n", + "\n", + "Install required Python packages, and import Python modules." + ], + "metadata": { + "id": "rE-UtZJnMs2q" + } + }, + { + "cell_type": "code", + "source": [ + "#!pip install -r requirements.txt" + ], + "metadata": { "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" - } + "id": "RJyP1GEXNHUy", + "outputId": "9c62258f-f6a1-4578-ced4-40f15f586e9a" + }, + "execution_count": null, + "outputs": [] }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "This notebook shows how to use the CrateDB vector store functionality around FLOAT_VECTOR and KNN_MATCH. You will learn how to use it to create a retrieval augmented generation (RAG) pipeline." - ], - "metadata": { - "id": "rUPQQ-jlMkUd" - } - }, - { - "cell_type": "markdown", - "source": [ - "#What is CrateDB?\n", - "CrateDB is an open-source, distributed, and scalable SQL analytics database for storing and analyzing massive amounts of data in near real-time, even with complex queries. It is wire-compatible to PostgreSQL, based on Lucene, and inherits the shared-nothing distribution layer of Elasticsearch.\n", - "\n", - "This example uses the Python client driver for CrateDB." 
- ], - "metadata": { - "id": "Pe-5yxFDMl0S" - } - }, - { - "cell_type": "markdown", - "source": [ - "#Getting Started\n", - "CrateDB supports storing vectors since version 5.5. You can leverage the fully managed service of CrateDB Cloud, or install CrateDB on your own, for example using Docker.\n", - "\n", - "`docker run --publish 4200:4200 --publish 5432:5432 --pull=always crate:latest -Cdiscovery.type=single-node`\n", - "\n", - "Install required Python packages, and import Python modules." - ], - "metadata": { - "id": "rE-UtZJnMs2q" - } - }, - { - "cell_type": "code", - "source": [ - "pip install langchain pypdf chromadb openai sentence_transformers sqlalchemy 'crate[sqlalchemy]' tiktoken" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RJyP1GEXNHUy", - "outputId": "9c62258f-f6a1-4578-ced4-40f15f586e9a" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting langchain\n", - " Downloading langchain-0.0.352-py3-none-any.whl (794 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m794.4/794.4 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting pypdf\n", - " Downloading pypdf-3.17.3-py3-none-any.whl (277 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m277.9/277.9 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting chromadb\n", - " Downloading chromadb-0.4.21-py3-none-any.whl (508 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m508.6/508.6 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting openai\n", - " Downloading openai-1.6.0-py3-none-any.whl (225 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.4/225.4 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting sentence_transformers\n", - " Downloading sentence-transformers-2.2.2.tar.gz (85 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.0/86.0 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n",
[… pip download, build, and uninstall progress output elided; the dependency-conflict report and "Successfully installed" summary below are kept …]
-            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "lida 0.0.10 requires kaleido, which is not installed.\n", - "lida 0.0.10 requires python-multipart, which is not installed.\n", - "llmx 0.0.15a0 requires cohere, which is not installed.\n", - "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed asgiref-3.7.2 backoff-2.2.1 bcrypt-4.1.2 chroma-hnswlib-0.7.3 chromadb-0.4.21 coloredlogs-15.0.1 crate-0.34.0 dataclasses-json-0.6.3 deprecated-1.2.14 fastapi-0.105.0 geojson-3.1.0 h11-0.14.0 httpcore-1.0.2 httptools-0.6.1 httpx-0.26.0 humanfriendly-10.0 importlib-metadata-6.11.0 jsonpatch-1.33 jsonpointer-2.4 kubernetes-28.1.0 langchain-0.0.352 langchain-community-0.0.5 langchain-core-0.1.2 langsmith-0.0.72 marshmallow-3.20.1 mmh3-4.0.1 monotonic-1.6 mypy-extensions-1.0.0 onnxruntime-1.16.3 openai-1.6.0 opentelemetry-api-1.22.0 opentelemetry-exporter-otlp-proto-common-1.22.0 opentelemetry-exporter-otlp-proto-grpc-1.22.0 opentelemetry-instrumentation-0.43b0 opentelemetry-instrumentation-asgi-0.43b0 opentelemetry-instrumentation-fastapi-0.43b0 opentelemetry-proto-1.22.0 opentelemetry-sdk-1.22.0 opentelemetry-semantic-conventions-0.43b0 opentelemetry-util-http-0.43b0 overrides-7.4.0 posthog-3.1.0 pulsar-client-3.3.0 pypdf-3.17.3 pypika-0.48.9 python-dotenv-1.0.0 sentence_transformers-2.2.2 sentencepiece-0.1.99 starlette-0.27.0 tiktoken-0.5.2 typing-extensions-4.9.0 typing-inspect-0.9.0 urllib3-1.26.18 uvicorn-0.25.0 uvloop-0.19.0 watchfiles-0.21.0 websockets-12.0\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "from langchain.document_loaders.csv_loader import CSVLoader\n", - "from langchain.vectorstores import Chroma\n", - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "from langchain.chains import RetrievalQA, ConversationalRetrievalChain\n", - "from langchain.llms import OpenAI\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "import pandas as pd\n", - "import sqlalchemy as sa\n", - "from sqlalchemy import create_engine\n", - "from sqlalchemy import text\n", - "import crate\n", - "import openai\n", - "import os\n", - "import getpass" - ], - "metadata": { - "id": "VUNjBDrXNNoG" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Create embeddings from dataset\n" - ], - "metadata": { - "id": "Cd2BLNlReU01" - } - }, - { - "cell_type": "code", - "source": [ - "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Open AI API Key:\")\n", - "openai.api_key = os.environ[\"OPENAI_API_KEY\"]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LsRfzgmeNjJc", - "outputId": "6fdff9a4-8007-4773-818d-2cf2fcce28b9" - }, - "execution_count": null, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Open AI API Key:··········\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "loader = CSVLoader(file_path=\"./sample_data/twitter_support_microsoft.csv\", encoding=\"utf-8\", csv_args={'delimiter': ','})\n", - "data = loader.load()\n", - "pages_text = [doc.page_content for doc in data]\n", - "print(pages_text[0])" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6Po5rpReNuhn", - "outputId": "84e363de-84be-4c96-d3b7-8c4561fd03db" - }, - "execution_count": null, - "outputs": [ - { - 
"output_type": "stream", - "name": "stdout", - "text": [ - "tweet_id: 2301\n", - "author_id: 116231\n", - "inbound: True\n", - "created_at: Tue Oct 31 20:22:23 +0000 2017\n", - "text: @MicrosoftHelps Please get back to me immediately this is of the upmost importance\n", - "response_tweet_id: 2299\n", - "in_response_to_tweet_id: 2306\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "embeddings = OpenAIEmbeddings(deployment='my-embedding-model', chunk_size=1)\n", - "pages_embeddings = embeddings.embed_documents(pages_text)" - ], - "metadata": { - "id": "nWl5RSPjPgGv" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#Write data to CrateDB\n", - "\n", - "The next step creates a dataframe that contains the text of the documents and their embeddings. The embeddings will be stored in CrateDB using FLOAT_VECTOR type." - ], - "metadata": { - "id": "QhOU-4aXQkTX" - } - }, - { - "cell_type": "code", - "source": [ - "df = pd.DataFrame(list(zip(pages_text, pages_embeddings)),columns =['text', 'embedding'])" - ], - "metadata": { - "id": "r_I0dlUNQgKU" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "host = getpass.getpass(\"Host:\")\n", - "password = getpass.getpass(\"password:\")\n", - "dbname=\"crate://admin:{0}@{1}:4200?ssl=true\".format(password,host)\n", - "create_table = text(\"CREATE TABLE text_data (text TEXT, embedding FLOAT_VECTOR(1536))\")\n", - "engine = create_engine(dbname, echo=False)\n", - "\n", - "with engine.connect() as con:\n", - " con.execute(create_table)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vzsx_YdaV2et", - "outputId": "221fe177-212e-4e74-9a26-e4d48d6eae3d" - }, - "execution_count": null, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Host:··········\n", - "password:··········\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "The text and embeddings are written to CrateDB database using CrateDB vector storage support:" - ], - "metadata": { - "id": "r5MDKdW5Y_Um" - } - }, - { - "cell_type": "code", - "source": [ - "df.to_sql(name='text_data', con=engine, if_exists='append', index=False)\n", - "df.head(5)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "8pzKlr3uV3Ql", - "outputId": "93566fa0-7ef4-44fd-e29e-23e7b29c645f" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " text \\\n", - "0 tweet_id: 2301\\nauthor_id: 116231\\ninbound: Tr... \n", - "1 tweet_id: 11879\\nauthor_id: MicrosoftHelps\\nin... \n", - "2 tweet_id: 11881\\nauthor_id: MicrosoftHelps\\nin... \n", - "3 tweet_id: 11890\\nauthor_id: 118332\\ninbound: T... \n", - "4 tweet_id: 11912\\nauthor_id: MicrosoftHelps\\nin... \n", - "\n", - " embedding \n", - "0 [-0.037185399571588756, -0.01364005917049614, ... \n", - "1 [-0.015454164058839018, 0.0032340502581370413,... \n", - "2 [-0.005936504790842904, 0.01942733669848253, 0... \n", - "3 [-0.011779013479771422, 0.005725434705161641, ... \n", - "4 [-0.022950152341946847, 0.004767860370434739, ... " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
textembedding
0tweet_id: 2301\\nauthor_id: 116231\\ninbound: Tr...[-0.037185399571588756, -0.01364005917049614, ...
1tweet_id: 11879\\nauthor_id: MicrosoftHelps\\nin...[-0.015454164058839018, 0.0032340502581370413,...
2tweet_id: 11881\\nauthor_id: MicrosoftHelps\\nin...[-0.005936504790842904, 0.01942733669848253, 0...
3tweet_id: 11890\\nauthor_id: 118332\\ninbound: T...[-0.011779013479771422, 0.005725434705161641, ...
4tweet_id: 11912\\nauthor_id: MicrosoftHelps\\nin...[-0.022950152341946847, 0.004767860370434739, ...
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "#Ask question\n", - "Let's define our question and create an embedding using OpenAI embedding model:" - ], - "metadata": { - "id": "GkAPZ55RZQ09" - } - }, - { - "cell_type": "code", - "source": [ - "my_question = \"How to update shipping address on existing order in Microsoft Store?\"\n", - "query_embedding = embeddings.embed_query(my_question)" - ], - "metadata": { - "id": "InhR73isZJCB" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#Find relevant context using similarity search\n", - "\n", - "The `knn_match (search_vector, query_vector, k) `function in CrateDB performs an approximate k-nearest neighbors (KNN) search within a dataset. KNN search involves finding the k data points that are most similar to a given query data point. We find the most similar vectors to our query vector using knn search capability in CrateDB:" - ], - "metadata": { - "id": "6XnNZHI6ajaS" - } + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "import openai\n", + "import pandas as pd\n", + "import sqlalchemy as sa\n", + "\n", + "from langchain.document_loaders.csv_loader import CSVLoader\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from pueblo.util.environ import getenvpass" + ], + "metadata": { + "id": "VUNjBDrXNNoG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Configure database settings" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "CONNECTION_STRING = os.environ.get(\n", + " \"CRATEDB_CONNECTION_STRING\",\n", + " \"crate://crate@localhost/\",\n", + ")\n", + "\n", + "# For CrateDB Cloud, use:\n", + "# CONNECTION_STRING = os.environ.get(\n", + "# \"CRATEDB_CONNECTION_STRING\",\n", + "# \"crate://username:password@hostname/?ssl=true\",\n", + "# )" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Configure OpenAI" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "getenvpass(\"OPENAI_API_KEY\", prompt=\"OpenAI API key:\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Patches\n", + "Those can be removed again after they have been upstreamed." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# TODO: Bring this into the `crate-python` driver.\n", + "from cratedb_toolkit.sqlalchemy.patch import patch_inspector\n", + "patch_inspector()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Create embeddings from dataset" + ], + "metadata": { + "id": "Cd2BLNlReU01" + } + }, + { + "cell_type": "code", + "source": [ + "loader = CSVLoader(file_path=\"./sample_data/twitter_support_microsoft.csv\", encoding=\"utf-8\", csv_args={'delimiter': ','})\n", + "data = loader.load()\n", + "pages_text = [doc.page_content for doc in data]\n", + "print(pages_text[0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "source": [ - "knn_query = text(\"\"\"SELECT text FROM text_data\n", - " WHERE knn_match(embedding, {0}, 2)\"\"\".format(query_embedding))\n", - "documents=[]\n", - "\n", - "with engine.connect() as con:\n", - " results = con.execute(knn_query)\n", - " for record in results:\n", - " documents.append(record[0])\n", - "\n", - "print(documents)\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VjLeMkwMagOf", - "outputId": "2c92d6fc-22aa-4914-b58c-bbd3928108e1" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "['tweet_id: 12858\\nauthor_id: 118603\\ninbound: True\\ncreated_at: Mon Oct 30 18:33:00 +0000 2017\\ntext: @MicrosoftHelps The store never gave me an error message. It\\'s STILL sitting there \"thinking/working\" after 2 hours.\\nresponse_tweet_id: 12857,12859\\nin_response_to_tweet_id: 12860', 'tweet_id: 12881\\nauthor_id: 118606\\ninbound: True\\ncreated_at: Wed Nov 01 12:18:10 +0000 2017\\ntext: @MicrosoftHelps okay. let me contact them\\nresponse_tweet_id: \\nin_response_to_tweet_id: 12879', \"tweet_id: 12868\\nauthor_id: MicrosoftHelps\\ninbound: False\\ncreated_at: Tue Oct 31 13:23:00 +0000 2017\\ntext: @118604 1/2 We don't have direct email. You can post your query via Community Forum for assistance: https://t.co/jsa5yeYZ1T.\\nresponse_tweet_id: \\nin_response_to_tweet_id: 12867\", \"tweet_id: 11883\\nauthor_id: MicrosoftHelps\\ninbound: False\\ncreated_at: Thu Oct 26 16:30:01 +0000 2017\\ntext: @118331 Hi. That's strange. Let's make sure that all your driver was updated. Here's how: https://t.co/paTrSXK1Xn. Update us.\\nresponse_tweet_id: 11882\\nin_response_to_tweet_id: 11884\", 'tweet_id: 9692\\nauthor_id: MicrosoftHelps\\ninbound: False\\ncreated_at: Fri Oct 20 23:34:51 +0000 2017\\ntext: @117762 Hellos James. You will need to cancel your current order and place a new one so you can include your updated details.\\nresponse_tweet_id: \\nin_response_to_tweet_id: 9691', \"tweet_id: 9685\\nauthor_id: 117762\\ninbound: True\\ncreated_at: Thu Oct 26 17:54:08 +0000 2017\\ntext: @MicrosoftHelps Seems to be good. 
Support responded by email saying that the order status won't change online, but the warehouse will ship to the new addr.\\nresponse_tweet_id: 9684,9686\\nin_response_to_tweet_id: 9687\", 'tweet_id: 11894\\nauthor_id: MicrosoftHelps\\ninbound: False\\ncreated_at: Fri Oct 27 19:42:25 +0000 2017\\ntext: @118333 2/2 Store app or via Microsoft Store online?\\nresponse_tweet_id: 11892\\nin_response_to_tweet_id: 11895', \"tweet_id: 9688\\nauthor_id: 117762\\ninbound: True\\ncreated_at: Wed Oct 25 01:36:41 +0000 2017\\ntext: @MicrosoftHelps I was helped when I called support again. However, the address hasn't changed in the order status yet.\\nresponse_tweet_id: 9689,9687\\nin_response_to_tweet_id: 9690\"]\n" - ] - } - ] + "id": "6Po5rpReNuhn", + "outputId": "84e363de-84be-4c96-d3b7-8c4561fd03db" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "embeddings = OpenAIEmbeddings(deployment='my-embedding-model', chunk_size=1)\n", + "pages_embeddings = embeddings.embed_documents(pages_text)" + ], + "metadata": { + "id": "nWl5RSPjPgGv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Write data to CrateDB\n", + "\n", + "The next step creates a dataframe that contains the text of the documents and their embeddings. The embeddings will be stored in CrateDB using FLOAT_VECTOR type." + ], + "metadata": { + "id": "QhOU-4aXQkTX" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.DataFrame(list(zip(pages_text, pages_embeddings)), columns=['text', 'embedding'])" + ], + "metadata": { + "id": "r_I0dlUNQgKU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "engine = sa.create_engine(CONNECTION_STRING, echo=False)\n", + "\n", + "create_table = sa.text(\"CREATE TABLE IF NOT EXISTS text_data (text TEXT, embedding FLOAT_VECTOR(1536))\")\n", + "with engine.connect() as con:\n", + " con.execute(create_table)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "source": [ - "#Augment system prompt and query LLM" - ], - "metadata": { - "id": "-j94BF-3e1Je" - } + "id": "vzsx_YdaV2et", + "outputId": "221fe177-212e-4e74-9a26-e4d48d6eae3d" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The text and embeddings are written to CrateDB database using CrateDB vector storage support:" + ], + "metadata": { + "id": "r5MDKdW5Y_Um" + } + }, + { + "cell_type": "code", + "source": [ + "df.to_sql(name='text_data', con=engine, if_exists='append', index=False)\n", + "df.head(5)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, - { - "cell_type": "code", - "source": [ - "context = '---\\n'.join(documents)\n", - "\n", - "system_prompt = f\"\"\"\n", - "You are customer support expert and get questions about Microsoft products and services.\n", - "To answer question use the information from the context. 
Remove new line characters from the answer.\n",
-        "If you don't find the relevant information there, say \"I don't know\".\n",
-        "\n",
-        "Context:\n",
-        "{context}\"\"\"\n",
-        "\n",
-        "chat_completion = openai.chat.completions.create(model=\"gpt-4\",\n",
-        "                                                  messages=[{\"role\": \"system\", \"content\": system_prompt},\n",
-        "                                                            {\"role\": \"user\", \"content\": my_question}])\n"
-      ],
-      "metadata": {
-        "id": "IEuq9r2EaqUz"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "chat_completion.choices[0].message.content"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 53
-        },
-        "id": "aQnmpCIZa13L",
-        "outputId": "a10c71a7-a6c7-4f83-c069-df49703d4654"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "'To update the shipping address on an existing order in the Microsoft Store, you will need to cancel your current order and place a new one so you can include your updated details.'"
-            ],
-            "application/vnd.google.colaboratory.intrinsic+json": {
-              "type": "string"
-            }
-          },
-          "metadata": {},
-          "execution_count": 14
-        }
-      ]
-    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Augment system prompt and query LLM"
+      ],
+      "metadata": {
+        "id": "-j94BF-3e1Je"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "context = '---\\n'.join(documents)\n",
+        "\n",
+        "system_prompt = f\"\"\"\n",
+        "You are a customer support expert and receive questions about Microsoft products and services.\n",
+        "To answer the question, use the information from the context. 
Remove new line characters from the answer.\n", + "If you don't find the relevant information there, say \"I don't know\".\n", + "\n", + "Context:\n", + "{context}\"\"\"\n", + "\n", + "chat_completion = openai.chat.completions.create(model=\"gpt-4\",\n", + " messages=[{\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": my_question}])\n" + ], + "metadata": { + "id": "IEuq9r2EaqUz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "chat_completion.choices[0].message.content" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "iTGxziOBvK25" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + "id": "aQnmpCIZa13L", + "outputId": "a10c71a7-a6c7-4f83-c069-df49703d4654" + }, + "execution_count": null, + "outputs": [] + } + ] +} diff --git a/topic/machine-learning/llm-langchain/pyproject.toml b/topic/machine-learning/llm-langchain/pyproject.toml index ac48fed6..5f2bc638 100644 --- a/topic/machine-learning/llm-langchain/pyproject.toml +++ b/topic/machine-learning/llm-langchain/pyproject.toml @@ -36,6 +36,9 @@ nb_diff_ignore = [ "/cells/13/outputs", "/cells/15/outputs", "/cells/17/outputs", + + # cratedb_rag_customer_support.ipynb + "/cells/*/outputs", ] [tool.coverage.run]
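
A note for reviewers, trailing the patch. The `knn_query` cell added by this change assembles its SQL in Python; it boils down to plain SQL along the following lines. This is a sketch only: the vector literal is abbreviated for readability, while the `embedding` column is declared as `FLOAT_VECTOR(1536)`, so a real query vector carries all 1536 dimensions.

```sql
-- Approximate k-nearest-neighbor search: the two rows whose embeddings
-- are closest to the given query vector (literal truncated here).
SELECT text
FROM text_data
WHERE knn_match(embedding, [0.011, -0.037, 0.005], 2);
```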
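The notebook interpolates the query embedding into the SQL string with `str.format()`. A bound parameter would avoid the string splicing; the sketch below assumes the `crate://` SQLAlchemy dialect passes a Python list through as an array parameter (`CONNECTION_STRING` and `query_embedding` are the names the notebook defines). If the dialect rejects the list, the interpolation used in the notebook remains the fallback.

```python
import sqlalchemy as sa

engine = sa.create_engine(CONNECTION_STRING, echo=False)

# The query embedding travels as a bound parameter instead of being
# pasted into the SQL text.
knn_query = sa.text(
    "SELECT text FROM text_data WHERE knn_match(embedding, :vector, 2)"
)

with engine.connect() as con:
    results = con.execute(knn_query, {"vector": query_embedding})
    documents = [record[0] for record in results]
```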