diff --git a/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb b/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb
index 63330f3..a7e9e35 100644
--- a/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb
+++ b/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb
@@ -6,9 +6,9 @@
     "id": "Xe65aA4nWBqQ"
    },
    "source": [
-    "# Sparsifying the BGE-Small for Embeddings\n",
+    "# Sparsifying the BGE-Small Model for Embeddings\n",
     "\n",
-    "BGE models are currently state-of-the-art models for embeddings on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard). In this notebook, we will sparsify the [bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model using [Sparsify's](https://github.com/neuralmagic/sparsify) INT8 quantization and unstructured pruning via its One-Shot method. We will also evaluate its accuracy and speed improvements vs. its dense variant after sparsification.To learn more about One-Shot, refer to this [guide](https://github.com/neuralmagic/sparsify/blob/main/docs/one-shot-experiment-guide.md)."
+    "BGE models are currently state-of-the-art models for embeddings on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard). In this notebook, we will sparsify the [bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model using [Sparsify's](https://github.com/neuralmagic/sparsify) INT8 quantization via its one-shot method. We will also evaluate the accuracy and speed of the quantized model against its dense variant after sparsification. To learn more about one-shot, refer to this [guide](https://github.com/neuralmagic/sparsify/blob/main/docs/one-shot-experiment-guide.md)."
    ]
   },
   {
@@ -21,87 +21,9 @@
     "id": "0tMHBYNrx1TI",
     "outputId": "3e65fe50-04d0-4cd1-f0c8-5ffcb662d9eb"
    },
-   "outputs": [
-    [… roughly 80 lines of pip install progress output removed by this diff: ANSI progress bars, "Building wheel for optimum-deepsparse / sparsify-nightly / GPUtil / sentence-transformers" messages, and a pip dependency-resolver warning that setuptools 59.5.0 and torch 2.0.0 conflict with the arviz, cvxpy, torchaudio, torchdata, and torchtext pins …]
-   ],
+   "outputs": [],
    "source": [
-    "!pip install git+https://github.com/neuralmagic/optimum-deepsparse.git -q\n",
+    "!pip install -U deepsparse-nightly[sentence_transformers] -q\n",
     "!pip install git+https://github.com/neuralmagic/sparsify.git -q\n",
     "!pip install sentence-transformers evaluate -q"
    ]
   },
@@ -197,101 +119,11 @@
     "outputId": "f8217276-5307-459b-c70d-5c69fd483085"
    },
    "outputs": [
-    [… Hugging Face download-progress widget outputs removed by this diff ("Downloading (…)lve/main/config.json: 0.00/743 …", model_id "e1488a555e204c88bbf68364aea3275c", etc.) …]
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "[nm_ort >WARN< is_supported_graph src/onnxruntime_neuralmagic/supported/ops.cc:150] Warning: Optimized runtime disabled - Detected dynamic input input_ids dim 0. Set inputs to static shapes to enable optimal performance.\n"
      ]
     },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bcc92fdee48245bfba23d24fb628789c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[nm_ort >WARN< is_supported_graph src/onnxruntime_neuralmagic/supported/ops.cc:150] Warning: Optimized runtime disabled - Detected dynamic input input_ids dim 0. Set inputs to static shapes to enable optimal performance.\n"
+     ]
+    },
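The outputs above appear to come from loading the quantized export with DeepSparse's sentence-transformers integration, which the rest of this diff builds on. Below is a minimal usage sketch, not part of the notebook itself: it assumes the `bge-small-en-v1.5-quant` directory is the one-shot export produced earlier, and that `encode` mirrors the standard `sentence_transformers` API (the dynamic-shape warning above is informational; the model still runs):

```python
from deepsparse.sentence_transformers import DeepSparseSentenceTransformer

# Load the INT8-quantized ONNX export on the DeepSparse runtime.
# export=False tells the integration the directory already contains an ONNX model.
model = DeepSparseSentenceTransformer("bge-small-en-v1.5-quant", export=False)

# encode mirrors sentence_transformers and returns one vector per input sentence;
# bge-small produces 384-dimensional embeddings.
embeddings = model.encode([
    "Sparsify quantized this model to INT8 in one shot.",
    "DeepSparse executes the quantized graph on CPU.",
])
print(embeddings.shape)  # expected: (2, 384)
```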
+    {
+     "data": {
+      "text/plain": [
+       "───────────────── Selected tasks  ─────────────────\n",
+       "STS\n",
+       "    - STSBenchmark, s2s\n",
+       "\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b1a72db93db14ad0882cf299c137b957",
+       "model_id": "2df95ea0a5e84907a3a852cfb6b2b755",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]"
+       "Batches:   0%|          | 0/31 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
      "output_type": "display_data"
     },
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:datasets.fingerprint:Parameter 'function'=<function evaluate_stsb at …> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3eb8f278cae84543833ad78a306e9601",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/24 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
+    {
+     "data": {
+      "text/plain": [
+       "───────────────── Selected tasks  ─────────────────\n",
+       "STS\n",
+       "    - STSBenchmark, s2s\n",
+       "\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
@@ -2571,45 +2317,37 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "dense model: pearson=0.8913432543187466%\n",
-     "sparse model: pearson=0.8563085613094055%\n",
-     "The sparse model achieves 96.00% accuracy of the dense model\n"
+     "{'STSBenchmark': {'mteb_version': '1.1.1', 'dataset_revision': 'b0fddb56ed78048fa8b90373c8a3cfc37b684831', 'mteb_dataset_name': 'STSBenchmark', 'validation': {'cos_sim': {'pearson': 0.8828211766495108, 'spearman': 0.8892465763120051}, 'manhattan': {'pearson': 0.886201824808084, 'spearman': 0.8907627276162985}, 'euclidean': {'pearson': 0.8868149931196716, 'spearman': 0.8913096186609996}, 'evaluation_time': 4.43}, 'test': {'cos_sim': {'pearson': 0.8431285123201885, 'spearman': 0.8586295017067542}, 'manhattan': {'pearson': 0.854393933014824, 'spearman': 0.8591549232752812}, 'euclidean': {'pearson': 0.8565471782504085, 'spearman': 0.8612847755343875}, 'evaluation_time': 1.19}}}\n"
     ]
    }
   ],
   "source": [
-    "from datasets import load_dataset\n",
-    "from evaluate import load\n",
-    "import torch\n",
-    "\n",
-    "eval_dataset = load_dataset(\"glue\",\"stsb\",split=\"validation\")\n",
-    "metric = load('glue', 'stsb')\n",
-    "\n",
-    "def compute_sentence_similarity(sentence_1, sentence_2, pipeline):\n",
-    "    embedding_1 = pipeline(sentence_1)\n",
-    "    embedding_2 = pipeline(sentence_2)\n",
-    "\n",
-    "    return torch.nn.functional.cosine_similarity(embedding_1, embedding_2, dim=1)\n",
-    "\n",
-    "def evaluate_stsb(example):\n",
-    "    default = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], dense_pipe)\n",
-    "    sparse = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], sparse_pipe)\n",
-    "    return {\n",
-    "        'reference': (example[\"label\"] - 1) / (5 - 1),\n",
-    "        'default': float(default),\n",
-    "        'sparse': float(sparse),\n",
-    "    }\n",
-    "\n",
-    "# run evaluation\n",
-    "result = eval_dataset.map(evaluate_stsb)\n",
-    "\n",
-    "# compute metrics\n",
-    "default_acc = metric.compute(predictions=result[\"default\"], references=result[\"reference\"])\n",
-    "sparse = metric.compute(predictions=result[\"sparse\"], references=result[\"reference\"])\n",
-    "\n",
-    "print(f\"dense model: pearson={default_acc['pearson']}%\")\n",
-    "print(f\"sparse model: pearson={sparse['pearson']}%\")\n",
-    "print(f\"The sparse model achieves {round(sparse['pearson']/default_acc['pearson'],2)*100:.2f}% accuracy of the dense model\")"
+    "from mteb import MTEB\n",
+    "\n",
+    "# Specify the model to use\n",
+    "quant = \"bge-small-en-v1.5-quant\"\n",
+    "dense = \"BAAI/bge-small-en-v1.5\"\n",
+    "\n",
+    "# DeepSparse Model Evaluation\n",
+    "from deepsparse.sentence_transformers import DeepSparseSentenceTransformer\n",
+    "model = DeepSparseSentenceTransformer(quant, export=False)\n",
+    "evaluation = MTEB(tasks=[\"STSBenchmark\"])\n",
+    "results_ds = evaluation.run(model, output_folder=f\"results/ds-{quant}\")\n",
+    "print(results_ds)\n",
+    "\n",
+    "# Original SentenceTransformers Model Evaluation\n",
+    "import sentence_transformers\n",
+    "model = sentence_transformers.SentenceTransformer(dense)\n",
+    "evaluation = MTEB(tasks=[\"STSBenchmark\"])\n",
+    "results_st = evaluation.run(model, output_folder=f\"results/st-{dense}\")\n",
+    "print(results_st)"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "The quantized model achieves 99.9% recovery compared to the dense model on MTEB's `cos_sim` `spearman` metric."
+  ]
+ },
 {
  "cell_type": "markdown",
  "metadata": {
   "id": "5MMJ8PG-eNJO"
  },
  "source": [
-   "# Benchmark the Dense PyTorch vs. Sparse ONNX Model for Latency"
+   "# Benchmark the Dense PyTorch vs. Quantized ONNX Model for Latency\n",
+   "\n",
+   "In addition to the MTEB benchmarking, the integration includes a custom script for benchmarking latency and throughput. Let's test how the dense and quantized models perform against each other. First, `git clone` the deepsparse repository:"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 54,
-  "metadata": {
-   "colab": {
-    "base_uri": "https://localhost:8080/"
+  "execution_count": 13,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "Cloning into 'deepsparse'...\n"
+    ]
    },
-   "id": "KxSXJJmceYiQ",
-   "outputId": "96dfe7c5-35ed-4a0b-8abd-3c4df7c7e5a8"
-  },
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+     "To disable this warning, you can either:\n",
+     "\t- Avoid using `tokenizers` before the fork if possible\n",
+     "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+    ]
+   },
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "remote: Enumerating objects: 18974, done.\u001b[K\n",
+     "remote: Counting objects: 100% (5600/5600), done.\u001b[K\n",
+     "remote: Compressing objects: 100% (1547/1547), done.\u001b[K\n",
+     "remote: Total 18974 (delta 4935), reused 4451 (delta 4037), pack-reused 13374\u001b[K\n",
+     "Receiving objects: 100% (18974/18974), 139.80 MiB | 31.52 MiB/s, done.\n",
+     "Resolving deltas: 100% (13356/13356), done.\n"
+    ]
+   }
+  ],
+  "source": [
+   "!git clone https://github.com/neuralmagic/deepsparse.git"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Now, run this CLI command to benchmark the models' latency encoding 100 sentences with a max sequence length of 512 and a batch size of 1:"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 12,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+     "To disable this warning, you can either:\n",
+     "\t- Avoid using `tokenizers` before the fork if possible\n",
+     "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+    ]
+   },
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
-    "Payload sequence length: 367\n",
-    "dense model latency: P95 latency (ms) - 810.6698678000611; Average latency (ms) - 359.64 +\\- 171.19;\n",
-    "sparse model latency: P95 latency (ms) - 375.64537654984633; Average latency (ms) - 321.86 +\\- 39.60;\n",
-    "Latency improvement through one-shot on 2 CPU cores: 2.16x\n"
+    "2023-11-14 10:13:18 deepsparse.engine WARNING batch_size < 1 so disabling batch size override\n",
+    "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20231110 COMMUNITY | (6c521a73) (release) (optimized) (system=avx2_vnni, binary=avx2)\n",
+    "\u001b[34m[nm_ort 7f4e9f8be000 >WARN< \u001b[33m is_supported_graph src/onnxruntime_neuralmagic/supported/ops.cc:150\u001b[34m] \u001b[0mWarning: Optimized runtime disabled - Detected dynamic input input_ids dim 0. 
Set inputs to static shapes to enable optimal performance.\n", + "\n", + "[SentenceTransformer]\n", + "Batch size: 1, Sentence length: 700\n", + "Latency: 100 sentences in 23.41 seconds\n", + "Throughput: 4.27 sentences/second\n", + "Batches: 100%|████████████████████████████████| 100/100 [00:07<00:00, 14.13it/s]\n", + "\n", + "[DeepSparse Optimized]\n", + "Batch size: 1, Sentence length: 700\n", + "Latency: 100 sentences in 7.09 seconds\n", + "Throughput: 14.11 sentences/second\n" ] } ], "source": [ - "import subprocess\n", - "from time import perf_counter\n", - "import numpy as np\n", - "\n", - "payload = \"Greetings, I'm Jane the robot, residing in the vibrant city of Seattle, USA. \" \\\n", - " \"My journey involves crafting innovative solutions as a Software Architect, \" \\\n", - " \"driving technological progress through collaborative endeavors and cutting-edge research. \" \\\n", - " \"My experience spans across diverse domains, from optimizing supply chain logistics \" \\\n", - " \"to enhancing medical diagnostics. Passionate about exploring AI ethics and \" \\\n", - " \"the human-machine partnership, I'm constantly evolving to pioneer the future of technology. \" \\\n", - " \"In my spare time, I enjoy exploring the beautiful Pacific Northwest, \" \\\n", - " \"with its majestic mountains and pristine forests. I'm an avid hiker and often find \" \\\n", - " \"myself on the trails, seeking inspiration from nature's wonders. \" \\\n", - " \"When it comes to my work, I believe that artificial intelligence \" \\\n", - " \"has the potential to transform industries and improve people's lives. \" \\\n", - " \"I'm particularly interested in natural language processing and \" \\\n", - " \"machine learning, and I'm dedicated to pushing the boundaries of what AI can achieve. \" \\\n", - " \"In addition to my technical pursuits, I'm also a strong advocate \" \\\n", - " \"for diversity and inclusion in the tech industry. I believe that a diverse \" \\\n", - " \"and inclusive workforce leads to better innovation and more equitable \" \\\n", - " \"technological solutions for society. \" \\\n", - " \"I'm an enthusiastic problem solver and love tackling complex challenges. \" \\\n", - " \"My approach to problem-solving involves a combination of creativity, \" \\\n", - " \"data-driven analysis, and a keen understanding of user needs. \" \\\n", - " \"I'm always eager to collaborate with like-minded individuals \" \\\n", - " \"to bring innovative ideas to life. \" \\\n", - " \"When I'm not working on AI projects or exploring the outdoors, \" \\\n", - " \"I can often be found in the kitchen, experimenting with new recipes \" \\\n", - " \"and cooking up delicious meals for friends and family. \" \\\n", - " \"I believe that the joy of creating extends beyond technology \" \\\n", - " \"and into the realms of culinary art. 
\" \\\n", - " \"My aspiration is to continue pushing the boundaries \" \\\n", - " \"of what AI can achieve while making a positive impact on society.\"\n", - "\n", - "print(f'Payload sequence length: {len(tokenizer(payload)[\"input_ids\"])}')\n", - "\n", - "def measure_latency(pipe):\n", - " latencies = []\n", - "\n", - " # Timed run\n", - " for _ in range(100):\n", - " start_time = perf_counter()\n", - " _ = pipe(payload)\n", - " latency = perf_counter() - start_time\n", - " latencies.append(latency)\n", - "\n", - " # Compute run statistics\n", - " time_avg_ms = 1000 * np.mean(latencies)\n", - " time_std_ms = 1000 * np.std(latencies)\n", - " time_p95_ms = 1000 * np.percentile(latencies, 95)\n", - " return f\"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};\", time_p95_ms\n", - "\n", - "dense_model = measure_latency(dense_pipe)\n", - "quantized_model = measure_latency(sparse_pipe)\n", - "\n", - "# Get the number of CPU cores using the nproc command\n", - "num_cores = int(subprocess.check_output(\"nproc\").decode().strip())\n", - "\n", - "print(f\"dense model latency: {dense_model[0]}\")\n", - "print(f\"sparse model latency: {quantized_model[0]}\")\n", - "print(f\"Latency improvement through one-shot on {num_cores} CPU cores: {round(dense_model[1] / quantized_model[1], 2)}x\")\n" + "!python deepsparse/src/deepsparse/sentence_transformers/benchmark_encoding.py --base_model BAAI/bge-small-en-v1.5 --sparse_model bge-small-en-v1.5-quant" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The quantized BGE model is able to improve latency performance against the dense variant on a 10 core laptop by 3.3X! Furthermore, on optimized hardware, especially avx512 with VNNI instructions, up to 5X improvement can be observed." ] } ], @@ -2719,7 +2466,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": {