diff --git a/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb b/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb index 63330f3..a7e9e35 100644 --- a/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb +++ b/notebooks/sparsify-bge-small/Sparsifying_BGE_Small.ipynb @@ -6,9 +6,9 @@ "id": "Xe65aA4nWBqQ" }, "source": [ - "# Sparsifying the BGE-Small for Embeddings\n", + "# Sparsifying the BGE-Small Model for Embeddings\n", "\n", - "BGE models are currently state-of-the-art models for embeddings on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard). In this notebook, we will sparsify the [bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model using [Sparsify's](https://github.com/neuralmagic/sparsify) INT8 quantization and unstructured pruning via its One-Shot method. We will also evaluate its accuracy and speed improvements vs. its dense variant after sparsification.To learn more about One-Shot, refer to this [guide](https://github.com/neuralmagic/sparsify/blob/main/docs/one-shot-experiment-guide.md)." + "BGE models are currently state-of-the-art models for embeddings on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard). In this notebook, we will sparsify the [bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model using [Sparsify's](https://github.com/neuralmagic/sparsify) INT8 quantization via its one-shot method. We will also evaluate its accuracy and speed improvements vs. its dense variant after sparsification. To learn more about one-shot, refer to this [guide](https://github.com/neuralmagic/sparsify/blob/main/docs/one-shot-experiment-guide.md)." ] }, { @@ -21,87 +21,9 @@ "id": "0tMHBYNrx1TI", "outputId": "3e65fe50-04d0-4cd1-f0c8-5ffcb662d9eb" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... 
\u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.3/46.3 MB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.0/301.0 kB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m71.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m107.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.9/294.9 kB\u001b[0m \u001b[31m30.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.6/519.6 kB\u001b[0m \u001b[31m48.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m93.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m39.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m67.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.6/143.6 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m60.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.4 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m24.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for optimum-deepsparse (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for diffusers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for optimum (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.5/97.5 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m404.2/404.2 kB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m153.4/153.4 MB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m978.3/978.3 kB\u001b[0m \u001b[31m60.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m226.0/226.0 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m952.4/952.4 kB\u001b[0m \u001b[31m67.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m619.9/619.9 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.9/60.9 MB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m59.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.7/78.7 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.9/121.9 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.0/21.0 MB\u001b[0m \u001b[31m61.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m849.3/849.3 kB\u001b[0m \u001b[31m67.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.8/11.8 MB\u001b[0m \u001b[31m74.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m557.1/557.1 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.1/317.1 MB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m168.4/168.4 MB\u001b[0m 
\u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.6/54.6 MB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.6/102.6 MB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m173.2/173.2 MB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.1/177.1 MB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.6/98.6 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m60.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.4/93.4 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for sparsify-nightly (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for GPUtil (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "arviz 0.15.1 requires setuptools>=60.0.0, but you have setuptools 59.5.0 which is incompatible.\n", - "cvxpy 1.3.2 requires setuptools>65.5.1, but you have setuptools 59.5.0 which is incompatible.\n", - "torchaudio 2.0.2+cu118 requires torch==2.0.1, but you have torch 2.0.0 which is incompatible.\n", - "torchdata 0.6.1 requires torch==2.0.1, but you have torch 2.0.0 which is incompatible.\n", - "torchtext 0.15.2 requires torch==2.0.1, but you have torch 2.0.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.0/86.0 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for sentence-transformers (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } - ], + "outputs": [], "source": [ - "!pip install git+https://github.com/neuralmagic/optimum-deepsparse.git -q\n", + "!pip install -U deepsparse-nightly[sentence_transformers] -q\n", "!pip install git+https://github.com/neuralmagic/sparsify.git -q\n", "!pip install sentence-transformers evaluate -q" ] @@ -197,101 +119,11 @@ "outputId": "f8217276-5307-459b-c70d-5c69fd483085" }, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e1488a555e204c88bbf68364aea3275c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)lve/main/config.json: 0%| | 0.00/743 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Framework not specified. 
Using pt to export to ONNX.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c98f9c33b2ae442b8e94d0de701ddeec", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading pytorch_model.bin: 0%| | 0.00/134M [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2ddd92845ba142ecb33d1c0c21570d53", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)okenizer_config.json: 0%| | 0.00/394 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a391a5742057445ca20ef9644693786c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)solve/main/vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "67c4f1b8e6c4447993f0c57daf8ff869", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)/main/tokenizer.json: 0%| | 0.00/711k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f1f80e70459a467bb38318c852f9dfb0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)cial_tokens_map.json: 0%| | 0.00/125 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", "text": [ + "Framework not specified. Using pt to export to ONNX.\n", "Using the export variant default. 
Available variants are:\n", "\t- default: The default ONNX variant.\n", "Using framework PyTorch: 2.0.0+cu117\n", @@ -312,11 +144,11 @@ { "data": { "text/plain": [ - "('dense-bge-small-en-v1.5/tokenizer_config.json',\n", - " 'dense-bge-small-en-v1.5/special_tokens_map.json',\n", - " 'dense-bge-small-en-v1.5/vocab.txt',\n", - " 'dense-bge-small-en-v1.5/added_tokens.json',\n", - " 'dense-bge-small-en-v1.5/tokenizer.json')" + "('bge-small-en-v1.5-dense/tokenizer_config.json',\n", + " 'bge-small-en-v1.5-dense/special_tokens_map.json',\n", + " 'bge-small-en-v1.5-dense/vocab.txt',\n", + " 'bge-small-en-v1.5-dense/added_tokens.json',\n", + " 'bge-small-en-v1.5-dense/tokenizer.json')" ] }, "execution_count": 2, @@ -336,7 +168,7 @@ "tokenizer = get_preprocessor(model_id)\n", "\n", "# save onnx checkpoint and tokenizer\n", - "onnx_path = Path(f\"dense-bge-small-en-v1.5\")\n", + "onnx_path = Path(\"bge-small-en-v1.5-dense\")\n", "model.save_pretrained(onnx_path)\n", "tokenizer.save_pretrained(onnx_path)" ] @@ -580,1797 +412,1564 @@ }, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4ddf19384864481d95fd820eeefab8c5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading builder script: 0%| | 0.00/28.8k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved 1000 npz files to data/\n" + ] + } + ], + "source": [ + "import os\n", + "import numpy as np\n", + "from datasets import load_dataset\n", + "from sentence_transformers import InputExample\n", + "\n", + "# Load the dataset\n", + "dataset = load_dataset(\"glue\", \"stsb\", split=\"train\")\n", + "\n", + "# Adjusted to get the first 1000 examples\n", + "n_examples = 1000\n", + "\n", + "# Create the \"data\" directory if it doesn't exist\n", + "if not os.path.exists('data'):\n", + " os.makedirs('data')\n", + "\n", + "# Define a function to create NPZ dictionaries\n", + "def 
create_npz_data(texts, index):\n", + " # Tokenize the texts\n", + " inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')\n", + "\n", + " # Extract input_ids, attention_mask, and token_type_ids\n", + " input_ids = inputs['input_ids'].cpu().numpy()[0]\n", + " attention_mask = inputs['attention_mask'].cpu().numpy()[0]\n", + " token_type_ids = inputs.get('token_type_ids', None)\n", + " if token_type_ids is not None:\n", + " token_type_ids = token_type_ids.cpu().numpy()[0]\n", + "\n", + " # Create the NPZ dictionary\n", + " npz_data = {\n", + " \"input_ids\": input_ids,\n", + " \"attention_mask\": attention_mask,\n", + " \"token_type_ids\": token_type_ids if token_type_ids is not None else np.array([]), # Handle cases where token_type_ids are not present\n", + " }\n", + "\n", + " # Save the dictionary as an NPZ file\n", + " npz_file_path = f'data/input_{index:04d}.npz'\n", + " np.savez(npz_file_path, **npz_data)\n", + "\n", + "# Create NPZ dictionaries and save them individually\n", + "train_examples = []\n", + "for i in range(n_examples):\n", + "\n", + " example = dataset[i]\n", + " train_examples.append(InputExample(texts=[example['sentence1'], example['sentence2']]))\n", + "\n", + " # Extract texts from InputExample instances\n", + " texts = [example.texts for example in train_examples]\n", + "\n", + " # Create the NPZ dictionary and save it\n", + " create_npz_data(texts, i)\n", + "\n", + "print(f'Saved {n_examples} npz files to data/')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S1w1lVKtc2lO" + }, + "source": [ + "# Login to Sparsify" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "biOtjg3xx2lc", + "outputId": "0a5b3a5e-70bb-4c41-a864-4af952d82958" + }, + "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "30ae5ed75a6a41c8b4e61b2025f9dc10", - "version_major": 2, - 
"version_minor": 0 - }, - "text/plain": [ - "Downloading metadata: 0%| | 0.00/28.7k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5f56fec2dd2e4cf4a94b46b62e23e5a6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading readme: 0%| | 0.00/27.9k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:sparsify.login:Logging into sparsify...\n", + "INFO:sparsify.utils.helpers:Successfully authenticated with Neural Magic Account API key\n", + "INFO:sparsify.login:sparsifyml version 1.6 is already installed, skipping installation from neuralmagic pypi server\n", + "INFO:sparsify.login:Logged in successfully, sparsify setup is complete.\n" + ] + } + ], + "source": [ + "!sparsify.login EJMfcp88Wpp7efA99WjAfSAFH6jhwfG9" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5bOOvwFcbg4" + }, + "source": [ + "# Run Sparsify One-Shot\n", + "\n", + "Pass the model directory and set optim-level to 0.2 which initiate quantization with a single CLI command:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "1GwhAgUux2n8", + "outputId": "28fbfff9-2993-41a4-bb1d-2de665f8b9aa" + }, + "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a5054fc2e718474184c80fd552296a3e", - "version_major": 2, - "version_minor": 0 - }, - 
"text/plain": [ - "Downloading data: 0%| | 0.00/803k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e454922991714ccd8ad4f9f39bb740a5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Generating train split: 0%| | 0/5749 [00:00, ? examples/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2e53170e55f44cb69e002018914a0d76", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Generating validation split: 0%| | 0/1500 [00:00, ? examples/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9c3d4c790bde4760993b724ac6114ab9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Generating test split: 0%| | 0/1379 [00:00, ? 
examples/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1238d70375b641e288bc82512dde2f46", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)8683f/.gitattributes: 0%| | 0.00/1.52k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "12b4a07b726d4d7b9697981fc2fefd81", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)_Pooling/config.json: 0%| | 0.00/190 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "68290158ed1e4e83bf34f4126f8ef6ac", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)19c878683f/README.md: 0%| | 0.00/89.1k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b614f3722e8149349572f74e81122983", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)c878683f/config.json: 0%| | 0.00/743 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "43d9bc087aa74697a39cabf89fae419d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)ce_transformers.json: 0%| | 0.00/124 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a281a3b81ccb4ef69f38309c11e67b70", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading pytorch_model.bin: 0%| | 0.00/134M [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - 
"application/vnd.jupyter.widget-view+json": { - "model_id": "0e8136fec91b4ec0a451c17decbddd6c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)nce_bert_config.json: 0%| | 0.00/52.0 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f56f25873f304c8bbcdaf261d05a8444", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)cial_tokens_map.json: 0%| | 0.00/125 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "83f63c60e7ce441fa356f62f015bbb0e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)8683f/tokenizer.json: 0%| | 0.00/711k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "73130e02d54d4023b5f03232be318ced", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)okenizer_config.json: 0%| | 0.00/394 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "21c5a1283de8443d92ed3e2db9b2e048", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)19c878683f/vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9600db20a8704cac8071b8845711bacd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading (…)878683f/modules.json: 0%| | 0.00/229 [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saved 1000 npz files to data/\n" - ] - } - ], - "source": [ - 
"import os\n", - "import numpy as np\n", - "from datasets import load_dataset\n", - "from transformers import AutoTokenizer\n", - "from sentence_transformers import InputExample\n", - "import torch\n", - "\n", - "# Load the dataset\n", - "dataset = load_dataset(\"glue\", \"stsb\", split=\"train\")\n", - "\n", - "# Adjusted to get the first 1000 examples\n", - "n_examples = 1000\n", - "\n", - "# Create the \"data\" directory if it doesn't exist\n", - "if not os.path.exists('data'):\n", - " os.makedirs('data')\n", - "\n", - "# Load AutoTokenizer from Hugging Face model repository\n", - "model_name = \"BAAI/bge-small-en-v1.5\"\n", - "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "\n", - "# Define a function to create NPZ dictionaries\n", - "def create_npz_data(texts, index):\n", - " # Tokenize the texts\n", - " inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt').to(\"cpu\")\n", - "\n", - " # Extract input_ids, attention_mask, and token_type_ids\n", - " input_ids = inputs['input_ids'].cpu().numpy()[0]\n", - " attention_mask = inputs['attention_mask'].cpu().numpy()[0]\n", - " token_type_ids = inputs.get('token_type_ids', None)\n", - " if token_type_ids is not None:\n", - " token_type_ids = token_type_ids.cpu().numpy()[0]\n", - "\n", - " # Create the NPZ dictionary\n", - " npz_data = {\n", - " \"input_ids\": input_ids,\n", - " \"attention_mask\": attention_mask,\n", - " \"token_type_ids\": token_type_ids if token_type_ids is not None else np.array([]), # Handle cases where token_type_ids are not present\n", - " }\n", - "\n", - " # Save the dictionary as an NPZ file\n", - " npz_file_path = f'data/input_{index:04d}.npz'\n", - " np.savez(npz_file_path, **npz_data)\n", - "\n", - "# Create NPZ dictionaries and save them individually\n", - "train_examples = []\n", - "for i in range(n_examples):\n", - "\n", - " example = dataset[i]\n", - " train_examples.append(InputExample(texts=[example['sentence1'], 
example['sentence2']]))\n", - "\n", - " # Extract texts from InputExample instances\n", - " texts = [example.texts for example in train_examples]\n", - "\n", - " # Create the NPZ dictionary and save it\n", - " create_npz_data(texts, i)\n", - "\n", - "print(f'Saved {n_examples} npz files to data/')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S1w1lVKtc2lO" - }, - "source": [ - "# Login to Sparsify" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "biOtjg3xx2lc", - "outputId": "0a5b3a5e-70bb-4c41-a864-4af952d82958" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:sparsify.login:Logging into sparsify...\n", - "INFO:sparsify.utils.helpers:Successfully authenticated with Neural Magic Account API key\n", - "INFO:sparsify.login:Installing sparsifyml version 1.6 from neuralmagic pypi server\n", - "Looking in indexes: https://nm:****@pypi.neuralmagic.com\n", - "Collecting sparsifyml-nightly~=1.6\n", - " Downloading https://pypi.neuralmagic.com/packages/sparsifyml_nightly-1.6.0.20230921-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (855 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m855.2/855.2 kB\u001b[0m \u001b[31m488.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: sparsifyml-nightly\n", - "Successfully installed sparsifyml-nightly-1.6.0.20230921\n", - "INFO:sparsify.login:Logged in successfully, sparsify setup is complete.\n" - ] - } - ], - "source": [ - "!sparsify.login EJMfcp88Wpp7efA99WjAfSAFH6jhwfG9" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u5bOOvwFcbg4" - }, - "source": [ - "# Run Sparsify One-Shot\n", - "\n", - "Pass the model directory and set optim-level to 0.5 which will set unstructured pruning (sparsity) at 50% and INT8 quantization with a single CLI command." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1GwhAgUux2n8", - "outputId": "28fbfff9-2993-41a4-bb1d-2de665f8b9aa" - }, - "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2023-09-25 13:21:49.585817: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "WARNING:sparsify.cli.opts:Unknown use-case nlp-embeddings, full feature set may not be availble for custom use cases\n", "INFO:sparsify.utils.helpers:Successfully authenticated with Neural Magic Account API key\n", "INFO:sparsify.login:sparsifyml version 1.6 is already installed, skipping installation from neuralmagic pypi server\n", "WARNING:sparsify.check_environment.gpu_device:Checking for GPU...\n", "WARNING:sparsify.check_environment.gpu_device:GPU check completed successfully\n", "WARNING:sparsify.check_environment.ort_health:Checking onnxruntime-gpu environment health...\n", - "2023-09-25 13:21:54 deepsparse.utils.onnx INFO Generating input 'X', type = float32, shape = [1, 3, 32, 32]\n", + "2023-11-14 09:18:47 deepsparse.utils.onnx INFO Generating input 'X', type = float32, shape = [1, 3, 32, 32]\n", "INFO:deepsparse.utils.onnx:Generating input 'X', type = float32, shape = [1, 3, 32, 32]\n", "WARNING:sparsify.check_environment.ort_health:onnxruntime-gpu environment check completed successfully\n", "INFO:sparsifyml.one_shot.sparsification.obcq.fast_obcq_modifier:Folded 0 Conv-BatchNormalization blocks\n", "INFO:sparsifyml.one_shot.sparsification.obcq.base_obcq_modifier:FastOBCQModifier: starting compression on layers: ['/encoder/layer.0/attention/self/query/MatMul', '/encoder/layer.0/attention/self/value/MatMul', '/encoder/layer.0/attention/output/dense/MatMul', '/encoder/layer.0/intermediate/dense/MatMul', '/encoder/layer.0/output/dense/MatMul', '/encoder/layer.1/attention/self/key/MatMul', '/encoder/layer.1/attention/self/query/MatMul', 
'/encoder/layer.1/attention/self/value/MatMul', '/encoder/layer.1/attention/output/dense/MatMul', '/encoder/layer.1/intermediate/dense/MatMul', '/encoder/layer.1/output/dense/MatMul', '/encoder/layer.2/attention/self/key/MatMul', '/encoder/layer.2/attention/self/query/MatMul', '/encoder/layer.2/attention/self/value/MatMul', '/encoder/layer.2/attention/output/dense/MatMul', '/encoder/layer.2/intermediate/dense/MatMul', '/encoder/layer.2/output/dense/MatMul', '/encoder/layer.3/attention/self/key/MatMul', '/encoder/layer.3/attention/self/query/MatMul', '/encoder/layer.3/attention/self/value/MatMul', '/encoder/layer.3/attention/output/dense/MatMul', '/encoder/layer.3/intermediate/dense/MatMul', '/encoder/layer.3/output/dense/MatMul', '/encoder/layer.4/attention/self/key/MatMul', '/encoder/layer.4/attention/self/query/MatMul', '/encoder/layer.4/attention/self/value/MatMul', '/encoder/layer.4/attention/output/dense/MatMul', '/encoder/layer.4/intermediate/dense/MatMul', '/encoder/layer.4/output/dense/MatMul', '/encoder/layer.5/attention/self/key/MatMul', '/encoder/layer.5/attention/self/query/MatMul', '/encoder/layer.5/attention/self/value/MatMul', '/encoder/layer.5/attention/output/dense/MatMul', '/encoder/layer.5/intermediate/dense/MatMul', '/encoder/layer.5/output/dense/MatMul', '/encoder/layer.6/attention/self/key/MatMul', '/encoder/layer.6/attention/self/query/MatMul', '/encoder/layer.6/attention/self/value/MatMul', '/encoder/layer.6/attention/output/dense/MatMul', '/encoder/layer.6/intermediate/dense/MatMul', '/encoder/layer.6/output/dense/MatMul', '/encoder/layer.7/attention/self/key/MatMul', '/encoder/layer.7/attention/self/query/MatMul', '/encoder/layer.7/attention/self/value/MatMul', '/encoder/layer.7/attention/output/dense/MatMul', '/encoder/layer.7/intermediate/dense/MatMul', '/encoder/layer.7/output/dense/MatMul', '/encoder/layer.8/attention/self/key/MatMul', '/encoder/layer.8/attention/self/query/MatMul', '/encoder/layer.8/attention/self/value/MatMul', 
'/encoder/layer.8/attention/output/dense/MatMul', '/encoder/layer.8/intermediate/dense/MatMul', '/encoder/layer.8/output/dense/MatMul', '/encoder/layer.9/attention/self/key/MatMul', '/encoder/layer.9/attention/self/query/MatMul', '/encoder/layer.9/attention/self/value/MatMul', '/encoder/layer.9/attention/output/dense/MatMul', '/encoder/layer.9/intermediate/dense/MatMul', '/encoder/layer.9/output/dense/MatMul', '/encoder/layer.10/attention/self/key/MatMul', '/encoder/layer.10/attention/self/query/MatMul', '/encoder/layer.10/attention/self/value/MatMul', '/encoder/layer.10/attention/output/dense/MatMul', '/encoder/layer.10/intermediate/dense/MatMul', '/encoder/layer.10/output/dense/MatMul', '/encoder/layer.11/attention/self/key/MatMul', '/encoder/layer.11/attention/self/query/MatMul', '/encoder/layer.11/attention/self/value/MatMul', '/encoder/layer.11/attention/output/dense/MatMul', '/encoder/layer.11/intermediate/dense/MatMul']\n", - "Calibration: 1001it [02:00, 8.31it/s]\n", - "INFO:sparsifyml.one_shot.sparsification.obcq.base_obcq_modifier:saving H estimates to experiments/20230925-132203/OBCQ/H_estimates\n", - "WARNING:/usr/local/lib/python3.10/dist-packages/sparsifyml/one_shot/sparsification/obcq/utils/compression.py:H inversion failed with original dampening fraction of 0.001. 
It eventually succeeded with dampening fraction of 0.01\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.0/attention/self/value/MatMul: 0% 0/24 [00:00, ?it/s]\u001b[A\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 8% 2/24 [00:01<00:17, 1.23it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 12% 3/24 [00:02<00:16, 1.24it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 17% 4/24 [00:03<00:16, 1.24it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 21% 5/24 [00:04<00:15, 1.25it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 25% 6/24 [00:04<00:14, 1.25it/s]\n", - "Compressing /encoder/layer.0/attention/self/value/MatMul: 25% 6/24 [00:04<00:14, 1.26it/s]\u001b[A\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 33% 8/24 [00:06<00:12, 1.24it/s]\n", - "Compressing /encoder/layer.0/attention/self/value/MatMul: 33% 8/24 [00:06<00:12, 1.24it/s]\u001b[A\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 42% 10/24 [00:08<00:11, 1.23it/s]\n", - "Compressing /encoder/layer.0/attention/self/value/MatMul: 42% 10/24 [00:08<00:11, 1.22it/s]\u001b[A\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 50% 12/24 [00:09<00:10, 1.17it/s]\n", - "Compressing /encoder/layer.0/attention/self/value/MatMul: 50% 12/24 [00:09<00:10, 1.17it/s]\u001b[A\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 54% 13/24 [00:10<00:10, 1.09it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 58% 14/24 [00:12<00:09, 1.04it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 62% 15/24 [00:13<00:08, 1.01it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 67% 16/24 [00:14<00:08, 1.01s/it]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 71% 17/24 [00:15<00:07, 1.02s/it]\n", - "Compressing 
/encoder/layer.0/attention/self/query/MatMul: 75% 18/24 [00:16<00:06, 1.02s/it]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 79% 19/24 [00:16<00:04, 1.05it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 83% 20/24 [00:17<00:03, 1.10it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 88% 21/24 [00:18<00:02, 1.14it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 92% 22/24 [00:19<00:01, 1.18it/s]\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 96% 23/24 [00:20<00:00, 1.20it/s]\n", - "Compressing /encoder/layer.0/attention/self/value/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", + "Calibration: 1001it [01:17, 12.93it/s] \n", + "INFO:sparsifyml.one_shot.sparsification.obcq.base_obcq_modifier:saving H estimates to experiments/20231114-091852/OBCQ/H_estimates\n", + "WARNING:/home/zeroshot/nm/examples/env/lib/python3.10/site-packages/sparsifyml/one_shot/sparsification/obcq/utils/compression.py:H inversion failed with original dampening fraction of 0.001. 
It eventually succeeded with dampening fraction of 0.01\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.0/attention/self/value/MatMul: 0%| | 0/24 [00:00,\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 4%| | 1/24 [00:02<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 8%| | 2/24 [00:04<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 12%|▏| 3/24 [00:06<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 17%|▏| 4/24 [00:08<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 21%|▏| 5/24 [00:10<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 25%|▎| 6/24 [00:13<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 29%|▎| 7/24 [00:15<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 33%|▎| 8/24 [00:17<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 38%|▍| 9/24 [00:19<00\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 42%|▍| 10/24 [00:21<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 46%|▍| 11/24 [00:23<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 50%|▌| 12/24 [00:25<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 54%|▌| 13/24 [00:27<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/value/MatMul: 58%|▌| 14/24 [00:28<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 58%|▌| 14/24 [00:30<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 62%|▋| 15/24 [00:32<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 67%|▋| 16/24 [00:34<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 71%|▋| 17/24 [00:36<0\u001b[A\n", + "Compressing 
/encoder/layer.0/attention/self/query/MatMul: 75%|▊| 18/24 [00:38<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 79%|▊| 19/24 [00:40<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 83%|▊| 20/24 [00:42<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 88%|▉| 21/24 [00:45<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 92%|▉| 22/24 [00:47<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/value/MatMul: 100%|█| 24/24 [00:48<0\u001b[A\n", + "\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 96%|▉| 23/24 [00:49<0\u001b[A\n", + "Compressing /encoder/layer.0/attention/self/query/MatMul: 100%|█| 24/24 [00:51<0\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 0%| | 0/24 [00:00, ?\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 4%| | 1/24 [00:02<00:4\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 8%| | 2/24 [00:04<00:4\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 12%|▏| 3/24 [00:06<00:4\u001b[A\n", + "Compressing /encoder/layer.0/attention/output/dense/MatMul: 21%|▏| 5/24 [00:09<\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 17%|▏| 4/24 [00:08<00:4\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 21%|▏| 5/24 [00:11<00:4\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 25%|▎| 6/24 [00:13<00:4\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 29%|▎| 7/24 [00:15<00:3\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 33%|▎| 8/24 [00:17<00:3\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 38%|▍| 9/24 [00:19<00:3\u001b[A\n", + "Compressing /encoder/layer.0/attention/output/dense/MatMul: 50%|▌| 12/24 [00:23\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 42%|▍| 10/24 [00:22<00:\u001b[A\n", + "Compressing 
/encoder/layer.0/intermediate/dense/MatMul: 46%|▍| 11/24 [00:24<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 50%|▌| 12/24 [00:27<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 54%|▌| 13/24 [00:29<00:\u001b[A\n", + "Compressing /encoder/layer.0/attention/output/dense/MatMul: 71%|▋| 17/24 [00:33\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 62%|▋| 15/24 [00:33<00:\u001b[A\n", + "Compressing /encoder/layer.0/attention/output/dense/MatMul: 79%|▊| 19/24 [00:37\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 67%|▋| 16/24 [00:35<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 71%|▋| 17/24 [00:38<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 75%|▊| 18/24 [00:40<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 79%|▊| 19/24 [00:42<00:\u001b[A\n", + "Compressing /encoder/layer.0/attention/output/dense/MatMul: 100%|█| 24/24 [00:47\u001b[A\n", + "\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 83%|▊| 20/24 [00:44<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 88%|▉| 21/24 [00:46<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 92%|▉| 22/24 [00:49<00:\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 3%| | 3/96 [00:05<02:54, 1.\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 96%|▉| 23/24 [00:51<00:\u001b[A\n", + "Compressing /encoder/layer.0/intermediate/dense/MatMul: 100%|█| 24/24 [00:53<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 0%| | 0/24 [00:00, ?\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 4%| | 1/24 [00:02<00:5\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 8%| | 2/24 [00:04<00:4\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 12%|▏| 3/24 [00:06<00:4\u001b[A\n", + "Compressing 
/encoder/layer.1/attention/self/key/MatMul: 17%|▏| 4/24 [00:08<00:4\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 21%|▏| 5/24 [00:10<00:3\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 25%|▎| 6/24 [00:12<00:3\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 29%|▎| 7/24 [00:13<00:3\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 33%|▎| 8/24 [00:15<00:3\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 42%|▍| 10/24 [00:19<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 46%|▍| 11/24 [00:21<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 50%|▌| 12/24 [00:23<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 54%|▌| 13/24 [00:25<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 58%|▌| 14/24 [00:27<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 62%|▋| 15/24 [00:29<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 67%|▋| 16/24 [00:31<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 71%|▋| 17/24 [00:33<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 75%|▊| 18/24 [00:36<00:\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 24%|▏| 23/96 [00:47<02:32, 2\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 79%|▊| 19/24 [00:38<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 83%|▊| 20/24 [00:41<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 88%|▉| 21/24 [00:43<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 92%|▉| 22/24 [00:46<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 96%|▉| 23/24 [00:48<00:\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/key/MatMul: 100%|█| 24/24 [00:50<00:\u001b[A\n", + "Compressing 
/encoder/layer.1/attention/self/query/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 4%| | 1/24 [00:02<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 8%| | 2/24 [00:04<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 12%|▏| 3/24 [00:07<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 17%|▏| 4/24 [00:09<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 21%|▏| 5/24 [00:11<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 25%|▎| 6/24 [00:13<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 29%|▎| 7/24 [00:15<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 33%|▎| 8/24 [00:17<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 38%|▍| 9/24 [00:19<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 42%|▍| 10/24 [00:21<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 46%|▍| 11/24 [00:24<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 50%|▌| 12/24 [00:26<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 54%|▌| 13/24 [00:28<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 58%|▌| 14/24 [00:30<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 62%|▋| 15/24 [00:32<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 67%|▋| 16/24 [00:34<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 71%|▋| 17/24 [00:36<0\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 49%|▍| 47/96 [01:38<01:40, 2\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 75%|▊| 18/24 [00:38<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 79%|▊| 19/24 [00:40<0\u001b[A\n", + "Compressing 
/encoder/layer.1/attention/self/query/MatMul: 83%|▊| 20/24 [00:43<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 88%|▉| 21/24 [00:44<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 96%|▉| 23/24 [00:48<0\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 55%|▌| 53/96 [01:50<01:30, 2\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/query/MatMul: 100%|█| 24/24 [00:51<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 4%| | 1/24 [00:02<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 8%| | 2/24 [00:04<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 12%|▏| 3/24 [00:06<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 17%|▏| 4/24 [00:08<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 21%|▏| 5/24 [00:10<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 25%|▎| 6/24 [00:12<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 29%|▎| 7/24 [00:14<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 33%|▎| 8/24 [00:16<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 38%|▍| 9/24 [00:18<00\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 42%|▍| 10/24 [00:20<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 46%|▍| 11/24 [00:22<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 50%|▌| 12/24 [00:24<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 54%|▌| 13/24 [00:26<0\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 71%|▋| 68/96 [02:20<00:58, 2\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 58%|▌| 14/24 [00:29<0\u001b[A\n", + "Compressing 
/encoder/layer.1/attention/self/value/MatMul: 67%|▋| 16/24 [00:33<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 71%|▋| 17/24 [00:35<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 75%|▊| 18/24 [00:37<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 79%|▊| 19/24 [00:39<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 83%|▊| 20/24 [00:41<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 88%|▉| 21/24 [00:43<0\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 79%|▊| 76/96 [02:36<00:40, 2\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 92%|▉| 22/24 [00:46<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 96%|▉| 23/24 [00:48<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/self/value/MatMul: 100%|█| 24/24 [00:50<0\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 0%| | 0/24 [00:00<\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 4%| | 1/24 [00:01<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 8%| | 2/24 [00:03<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 12%|▏| 3/24 [00:05<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 17%|▏| 4/24 [00:08<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 21%|▏| 5/24 [00:10<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 25%|▎| 6/24 [00:12<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 29%|▎| 7/24 [00:14<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 33%|▎| 8/24 [00:16<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 38%|▍| 9/24 [00:18<\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 42%|▍| 10/24 [00:20\u001b[A\n", + "Compressing 
/encoder/layer.1/attention/output/dense/MatMul: 46%|▍| 11/24 [00:22\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 50%|▌| 12/24 [00:24\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 96%|▉| 92/96 [03:08<00:07, 1\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 58%|▌| 14/24 [00:28\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 62%|▋| 15/24 [00:30\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 67%|▋| 16/24 [00:32\u001b[A\n", + "Compressing /encoder/layer.0/output/dense/MatMul: 100%|█| 96/96 [03:16<00:00, 2\u001b[A\n", + "\n", + "Compressing /encoder/layer.1/intermediate/dense/MatMul: 0%| | 0/24 [00:00, ?\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 71%|▋| 17/24 [00:35\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 75%|▊| 18/24 [00:37\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 79%|▊| 19/24 [00:39\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 88%|▉| 21/24 [00:43\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 92%|▉| 22/24 [00:45\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 96%|▉| 23/24 [00:47\u001b[A\n", + "Compressing /encoder/layer.1/attention/output/dense/MatMul: 100%|█| 24/24 [00:49\u001b[A\n", + "\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 0%| | 0/96 [00:00, ?it/s]\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 1%| | 1/96 [00:02<03:17, 2.\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 2%| | 2/96 [00:04<03:27, 2.\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 3%| | 3/96 [00:06<03:25, 2.\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 4%| | 4/96 [00:08<03:25, 2.\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 5%| | 5/96 [00:11<03:25, 2.\u001b[A\n", + "Compressing 
/encoder/layer.1/output/dense/MatMul: 6%| | 6/96 [00:13<03:14, 2.\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 8%| | 8/96 [00:16<02:52, 1.\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 9%| | 9/96 [00:18<02:49, 1.\u001b[A\n", + "Compressing /encoder/layer.1/intermediate/dense/MatMul: 71%|▋| 17/24 [00:35<00:\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 10%| | 10/96 [00:20<02:57, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 11%| | 11/96 [00:23<03:09, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 12%|▏| 12/96 [00:25<03:05, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 14%|▏| 13/96 [00:27<02:54, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 16%|▏| 15/96 [00:31<02:40, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 17%|▏| 16/96 [00:33<02:39, 2\u001b[A\n", + "Compressing /encoder/layer.1/intermediate/dense/MatMul: 100%|█| 24/24 [00:50<00:\u001b[A\n", + "\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 18%|▏| 17/96 [00:35<02:45, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 19%|▏| 18/96 [00:37<02:42, 2\u001b[A\n", + "Compressing /encoder/layer.2/attention/self/key/MatMul: 8%| | 2/24 [00:03<00:4\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 20%|▏| 19/96 [00:40<02:46, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 21%|▏| 20/96 [00:41<02:37, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 23%|▏| 22/96 [00:45<02:27, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 24%|▏| 23/96 [00:47<02:24, 1\u001b[A\n", + "Compressing /encoder/layer.2/attention/self/key/MatMul: 29%|▎| 7/24 [00:13<00:3\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 25%|▎| 24/96 [00:49<02:25, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 26%|▎| 25/96 [00:52<02:29, 2\u001b[A\n", + "Compressing 
/encoder/layer.1/output/dense/MatMul: 27%|▎| 26/96 [00:54<02:25, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 28%|▎| 27/96 [00:56<02:27, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 29%|▎| 28/96 [00:58<02:24, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 30%|▎| 29/96 [01:00<02:18, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 32%|▎| 31/96 [01:04<02:09, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 33%|▎| 32/96 [01:06<02:06, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 34%|▎| 33/96 [01:08<02:03, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 35%|▎| 34/96 [01:10<02:00, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 36%|▎| 35/96 [01:11<01:56, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 38%|▍| 36/96 [01:13<01:53, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 39%|▍| 37/96 [01:15<01:53, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 40%|▍| 38/96 [01:17<01:54, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 41%|▍| 39/96 [01:19<01:53, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 42%|▍| 40/96 [01:21<01:51, 1\u001b[A\n", + "Compressing /encoder/layer.2/attention/self/key/MatMul: 100%|█| 24/24 [00:49<00:\u001b[A\n", + "\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 44%|▍| 42/96 [01:25<01:45, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 45%|▍| 43/96 [01:27<01:42, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 46%|▍| 44/96 [01:29<01:41, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 47%|▍| 45/96 [01:31<01:38, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 48%|▍| 46/96 [01:33<01:35, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 49%|▍| 47/96 [01:36<01:48, 2\u001b[A\n", + "Compressing 
/encoder/layer.1/output/dense/MatMul: 50%|▌| 48/96 [01:38<01:48, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 52%|▌| 50/96 [01:42<01:36, 2\u001b[A\n", + "Compressing /encoder/layer.2/attention/self/query/MatMul: 33%|▎| 8/24 [00:18<00\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 53%|▌| 51/96 [01:44<01:35, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 55%|▌| 53/96 [01:48<01:28, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 56%|▌| 54/96 [01:50<01:23, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 57%|▌| 55/96 [01:52<01:18, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 59%|▌| 57/96 [01:55<01:10, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 60%|▌| 58/96 [01:57<01:09, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 61%|▌| 59/96 [01:59<01:07, 1\u001b[A\n", + "Compressing /encoder/layer.2/attention/self/query/MatMul: 67%|▋| 16/24 [00:35<0\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 62%|▋| 60/96 [02:01<01:07, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 64%|▋| 61/96 [02:03<01:06, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 66%|▋| 63/96 [02:07<01:02, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 67%|▋| 64/96 [02:09<01:00, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 68%|▋| 65/96 [02:10<00:58, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 69%|▋| 66/96 [02:13<01:00, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 70%|▋| 67/96 [02:15<01:01, 2\u001b[A\n", + "Compressing /encoder/layer.2/attention/self/query/MatMul: 100%|█| 24/24 [00:52<0\u001b[A\n", + "\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 71%|▋| 68/96 [02:17<01:00, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 72%|▋| 69/96 [02:19<00:57, 2\u001b[A\n", + "Compressing 
/encoder/layer.1/output/dense/MatMul: 73%|▋| 70/96 [02:21<00:53, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 74%|▋| 71/96 [02:23<00:51, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 76%|▊| 73/96 [02:27<00:46, 2\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 77%|▊| 74/96 [02:29<00:43, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 78%|▊| 75/96 [02:31<00:40, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 79%|▊| 76/96 [02:33<00:37, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 80%|▊| 77/96 [02:35<00:35, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 82%|▊| 79/96 [02:39<00:32, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 83%|▊| 80/96 [02:41<00:30, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 84%|▊| 81/96 [02:42<00:28, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 85%|▊| 82/96 [02:44<00:26, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 86%|▊| 83/96 [02:46<00:24, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 88%|▉| 84/96 [02:48<00:23, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 89%|▉| 85/96 [02:50<00:21, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 90%|▉| 86/96 [02:52<00:19, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 91%|▉| 87/96 [02:54<00:17, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 93%|▉| 89/96 [02:58<00:13, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 94%|▉| 90/96 [03:00<00:11, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 95%|▉| 91/96 [03:02<00:09, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 96%|▉| 92/96 [03:03<00:07, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 97%|▉| 93/96 [03:05<00:05, 1\u001b[A\n", + "Compressing 
/encoder/layer.1/output/dense/MatMul: 98%|▉| 94/96 [03:07<00:03, 1\u001b[A\n", + "Compressing /encoder/layer.2/attention/self/value/MatMul: 100%|█| 24/24 [00:52<0\u001b[A\n", "\n", - "Compressing /encoder/layer.0/attention/self/query/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 4% 1/24 [00:00<00:18, 1.24it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 8% 2/24 [00:01<00:17, 1.22it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 12% 3/24 [00:02<00:16, 1.24it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 17% 4/24 [00:03<00:16, 1.23it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 21% 5/24 [00:04<00:15, 1.23it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 25% 6/24 [00:04<00:14, 1.24it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 29% 7/24 [00:05<00:14, 1.21it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 33% 8/24 [00:06<00:13, 1.21it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 38% 9/24 [00:07<00:12, 1.20it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 42% 10/24 [00:08<00:11, 1.19it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 46% 11/24 [00:09<00:11, 1.12it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 50% 12/24 [00:10<00:11, 1.06it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 54% 13/24 [00:11<00:10, 1.03it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 58% 14/24 [00:12<00:09, 1.01it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 62% 15/24 [00:13<00:08, 1.00it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 67% 16/24 [00:14<00:08, 1.01s/it]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 71% 17/24 [00:15<00:06, 
1.03it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 75% 18/24 [00:16<00:05, 1.10it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 79% 19/24 [00:16<00:04, 1.16it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 83% 20/24 [00:17<00:03, 1.19it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 88% 21/24 [00:18<00:02, 1.21it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 92% 22/24 [00:19<00:01, 1.22it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 96% 23/24 [00:20<00:00, 1.23it/s]\n", - "Compressing /encoder/layer.0/attention/output/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.0/intermediate/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 4% 1/24 [00:00<00:18, 1.23it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 8% 2/24 [00:01<00:17, 1.28it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 12% 3/24 [00:02<00:16, 1.29it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 17% 4/24 [00:03<00:15, 1.27it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 21% 5/24 [00:03<00:14, 1.27it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 25% 6/24 [00:04<00:14, 1.25it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 29% 7/24 [00:05<00:13, 1.23it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 33% 8/24 [00:06<00:13, 1.21it/s]\n", - "Compressing /encoder/layer.0/output/dense/MatMul: 8% 8/96 [00:06<01:12, 1.21it/s]\u001b[A\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 38% 9/24 [00:07<00:12, 1.18it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 42% 10/24 [00:08<00:12, 1.12it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 
46% 11/24 [00:09<00:12, 1.06it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 54% 13/24 [00:11<00:10, 1.01it/s]\n", - "Compressing /encoder/layer.0/output/dense/MatMul: 14% 13/96 [00:11<01:22, 1.00it/s]\u001b[A\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 58% 14/24 [00:12<00:10, 1.00s/it]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 62% 15/24 [00:13<00:09, 1.01s/it]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 67% 16/24 [00:14<00:07, 1.01it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 71% 17/24 [00:15<00:06, 1.07it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 75% 18/24 [00:16<00:05, 1.13it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 79% 19/24 [00:16<00:04, 1.18it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 83% 20/24 [00:17<00:03, 1.21it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 88% 21/24 [00:18<00:02, 1.23it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 96% 23/24 [00:19<00:00, 1.27it/s]\n", - "Compressing /encoder/layer.1/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 99%|▉| 95/96 [03:09<00:01, 1\u001b[A\n", + "Compressing /encoder/layer.1/output/dense/MatMul: 100%|█| 96/96 [03:11<00:00, 2\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 0%| | 0/24 [00:00, ?\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 4%| | 1/24 [00:02<00:4\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 8%| | 2/24 [00:04<00:4\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 17%|▏| 4/24 [00:07<00:3\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 21%|▏| 5/24 [00:09<00:3\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 25%|▎| 6/24 [00:11<00:3\u001b[A\n", + "Compressing 
/encoder/layer.2/intermediate/dense/MatMul: 29%|▎| 7/24 [00:13<00:3\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 33%|▎| 8/24 [00:15<00:3\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 38%|▍| 9/24 [00:17<00:2\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 42%|▍| 10/24 [00:19<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 46%|▍| 11/24 [00:21<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 50%|▌| 12/24 [00:23<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 58%|▌| 14/24 [00:27<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 62%|▋| 15/24 [00:29<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 67%|▋| 16/24 [00:31<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 71%|▋| 17/24 [00:33<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 75%|▊| 18/24 [00:34<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 79%|▊| 19/24 [00:36<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 83%|▊| 20/24 [00:38<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 88%|▉| 21/24 [00:40<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 96%|▉| 23/24 [00:44<00:\u001b[A\n", + "Compressing /encoder/layer.2/intermediate/dense/MatMul: 100%|█| 24/24 [00:46<00:\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 0%| | 0/96 [00:00, ?it/s]\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 1%| | 1/96 [00:01<03:00, 1.\u001b[A\n", + "Compressing /encoder/layer.2/attention/output/dense/MatMul: 100%|█| 24/24 [00:51\u001b[A\n", "\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 4% 1/24 [00:00<00:18, 1.25it/s]\n", - "Compressing 
/encoder/layer.1/attention/self/query/MatMul: 8% 2/24 [00:01<00:17, 1.26it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 12% 3/24 [00:02<00:16, 1.25it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 17% 4/24 [00:03<00:15, 1.26it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 21% 5/24 [00:04<00:15, 1.24it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 25% 6/24 [00:04<00:14, 1.22it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 29% 7/24 [00:05<00:13, 1.22it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 33% 8/24 [00:06<00:13, 1.22it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 38% 9/24 [00:07<00:13, 1.14it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 42% 10/24 [00:08<00:13, 1.07it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 46% 11/24 [00:09<00:12, 1.04it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 50% 12/24 [00:10<00:11, 1.01it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 54% 13/24 [00:11<00:11, 1.01s/it]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 58% 14/24 [00:12<00:10, 1.03s/it]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 62% 15/24 [00:13<00:09, 1.00s/it]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 67% 16/24 [00:14<00:07, 1.06it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 71% 17/24 [00:15<00:06, 1.11it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 75% 18/24 [00:16<00:05, 1.15it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 79% 19/24 [00:17<00:04, 1.04it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 83% 20/24 [00:18<00:03, 1.08it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 88% 21/24 [00:18<00:02, 1.12it/s]\n", - "Compressing 
/encoder/layer.1/attention/self/query/MatMul: 92% 22/24 [00:19<00:01, 1.16it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 96% 23/24 [00:20<00:00, 1.18it/s]\n", - "Compressing /encoder/layer.1/attention/self/query/MatMul: 100% 24/24 [00:21<00:00, 1.12it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 4% 1/24 [00:00<00:18, 1.26it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 8% 2/24 [00:01<00:17, 1.25it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 12% 3/24 [00:02<00:17, 1.18it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 17% 4/24 [00:03<00:17, 1.17it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 21% 5/24 [00:04<00:16, 1.16it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 25% 6/24 [00:05<00:15, 1.17it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 29% 7/24 [00:06<00:14, 1.13it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 33% 8/24 [00:07<00:14, 1.07it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 38% 9/24 [00:08<00:14, 1.03it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 42% 10/24 [00:09<00:13, 1.01it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 46% 11/24 [00:10<00:13, 1.01s/it]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 50% 12/24 [00:11<00:12, 1.02s/it]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 54% 13/24 [00:12<00:11, 1.03s/it]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 58% 14/24 [00:13<00:09, 1.04it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 62% 15/24 [00:13<00:08, 1.09it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 67% 16/24 [00:14<00:07, 1.14it/s]\n", - "Compressing 
/encoder/layer.1/attention/self/value/MatMul: 71% 17/24 [00:15<00:06, 1.16it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 75% 18/24 [00:16<00:05, 1.18it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 79% 19/24 [00:17<00:04, 1.20it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 83% 20/24 [00:17<00:03, 1.22it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 88% 21/24 [00:18<00:02, 1.21it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 92% 22/24 [00:19<00:01, 1.21it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 96% 23/24 [00:20<00:00, 1.21it/s]\n", - "Compressing /encoder/layer.1/attention/self/value/MatMul: 100% 24/24 [00:21<00:00, 1.13it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 4% 1/24 [00:00<00:18, 1.24it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 8% 2/24 [00:01<00:18, 1.18it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 12% 3/24 [00:02<00:18, 1.16it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 17% 4/24 [00:03<00:17, 1.16it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 21% 5/24 [00:04<00:16, 1.13it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 25% 6/24 [00:05<00:16, 1.08it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 29% 7/24 [00:06<00:16, 1.03it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 33% 8/24 [00:07<00:15, 1.02it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 38% 9/24 [00:08<00:14, 1.00it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 42% 10/24 [00:09<00:14, 1.01s/it]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 46% 11/24 [00:10<00:13, 1.02s/it]\n", - "Compressing 
/encoder/layer.1/attention/output/dense/MatMul: 50% 12/24 [00:11<00:11, 1.00it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 54% 13/24 [00:12<00:10, 1.07it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 58% 14/24 [00:13<00:08, 1.12it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 62% 15/24 [00:13<00:07, 1.17it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 67% 16/24 [00:14<00:06, 1.19it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 71% 17/24 [00:15<00:05, 1.22it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 75% 18/24 [00:16<00:04, 1.24it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 79% 19/24 [00:16<00:04, 1.25it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 83% 20/24 [00:17<00:03, 1.26it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 88% 21/24 [00:18<00:02, 1.22it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 92% 22/24 [00:19<00:01, 1.17it/s]\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 96% 23/24 [00:20<00:00, 1.20it/s]\n", - "Compressing /encoder/layer.0/output/dense/MatMul: 100% 96/96 [01:23<00:00, 1.14it/s]\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 2%| | 2/96 [00:03<03:02, 1.\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 3%| | 3/96 [00:05<02:58, 1.\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 4%| | 4/96 [00:07<03:05, 2.\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 5%| | 5/96 [00:10<03:07, 2.\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 6%| | 6/96 [00:12<03:09, 2.\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 7%| | 7/96 [00:14<03:02, 2.\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 9%| | 9/96 [00:17<02:49, 1.\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 10%| | 
10/96 [00:19<02:46, 1\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/key/MatMul: 33%|▎| 8/24 [00:17<00:3\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 11%| | 11/96 [00:22<02:54, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 14%|▏| 13/96 [00:26<03:03, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 15%|▏| 14/96 [00:28<02:55, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 16%|▏| 15/96 [00:30<02:50, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 17%|▏| 16/96 [00:33<02:50, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 18%|▏| 17/96 [00:35<02:53, 2\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/key/MatMul: 62%|▋| 15/24 [00:33<00:\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 19%|▏| 18/96 [00:37<02:57, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 20%|▏| 19/96 [00:40<02:53, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 21%|▏| 20/96 [00:42<02:44, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 22%|▏| 21/96 [00:43<02:36, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 23%|▏| 22/96 [00:45<02:32, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 24%|▏| 23/96 [00:48<02:31, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 25%|▎| 24/96 [00:49<02:24, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 27%|▎| 26/96 [00:53<02:16, 1\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/key/MatMul: 100%|█| 24/24 [00:51<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.1/attention/output/dense/MatMul: 100% 24/24 [00:21<00:00, 1.13it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 0% 0/96 [00:00, ?it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 1% 1/96 [00:00<01:21, 1.17it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 2% 2/96 [00:01<01:20, 
1.16it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 3% 3/96 [00:02<01:20, 1.16it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 4% 4/96 [00:03<01:19, 1.15it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 5% 5/96 [00:04<01:25, 1.06it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 6% 6/96 [00:05<01:27, 1.02it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 7% 7/96 [00:06<01:29, 1.01s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 8% 8/96 [00:07<01:29, 1.02s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 9% 9/96 [00:08<01:30, 1.04s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 10% 10/96 [00:09<01:29, 1.05s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 11% 11/96 [00:10<01:24, 1.01it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 12% 12/96 [00:11<01:18, 1.07it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 14% 13/96 [00:12<01:14, 1.11it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 15% 14/96 [00:13<01:11, 1.15it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 16% 15/96 [00:13<01:08, 1.19it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 17% 16/96 [00:14<01:06, 1.21it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 18% 17/96 [00:15<01:04, 1.22it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 19% 18/96 [00:16<01:03, 1.23it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 20% 19/96 [00:17<01:02, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 21% 20/96 [00:17<01:00, 1.25it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 22% 21/96 [00:18<00:59, 1.26it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 23% 22/96 [00:19<00:58, 1.25it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 24% 23/96 [00:20<00:58, 1.25it/s]\n", - "Compressing /encoder/layer.1/intermediate/dense/MatMul: 100% 24/24 [00:21<00:00, 
1.13it/s]\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 28%|▎| 27/96 [00:55<02:16, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 29%|▎| 28/96 [00:57<02:13, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 30%|▎| 29/96 [00:59<02:10, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 31%|▎| 30/96 [01:01<02:07, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 32%|▎| 31/96 [01:03<02:07, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 33%|▎| 32/96 [01:05<02:07, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 34%|▎| 33/96 [01:07<02:03, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 35%|▎| 34/96 [01:09<02:00, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 36%|▎| 35/96 [01:11<02:01, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 38%|▍| 36/96 [01:13<02:00, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 39%|▍| 37/96 [01:15<02:04, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 40%|▍| 38/96 [01:18<02:05, 2\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/query/MatMul: 50%|▌| 12/24 [00:24<0\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 41%|▍| 39/96 [01:20<02:03, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 42%|▍| 40/96 [01:22<02:00, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 44%|▍| 42/96 [01:26<01:52, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 45%|▍| 43/96 [01:28<01:48, 2\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/query/MatMul: 71%|▋| 17/24 [00:34<0\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 46%|▍| 44/96 [01:31<01:58, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 47%|▍| 45/96 [01:34<02:06, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 48%|▍| 46/96 [01:36<01:58, 2\u001b[A\n", + 
"Compressing /encoder/layer.2/output/dense/MatMul: 49%|▍| 47/96 [01:38<01:47, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 50%|▌| 48/96 [01:40<01:42, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 51%|▌| 49/96 [01:42<01:39, 2\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/query/MatMul: 100%|█| 24/24 [00:49<0\u001b[A\n", "\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 25% 24/96 [00:21<00:58, 1.23it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 26% 25/96 [00:21<00:58, 1.22it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 27% 26/96 [00:22<00:57, 1.21it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 28% 27/96 [00:23<00:58, 1.18it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 29% 28/96 [00:24<01:01, 1.10it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 30% 29/96 [00:25<01:03, 1.06it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 31% 30/96 [00:26<01:03, 1.03it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 32% 31/96 [00:27<01:04, 1.00it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 33% 32/96 [00:28<01:04, 1.01s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 34% 33/96 [00:29<01:03, 1.01s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 35% 34/96 [00:30<01:01, 1.01it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 36% 35/96 [00:31<00:56, 1.07it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 38% 36/96 [00:32<00:53, 1.12it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 39% 37/96 [00:33<00:50, 1.16it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 40% 38/96 [00:34<00:48, 1.19it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 41% 39/96 [00:34<00:47, 1.21it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 42% 40/96 [00:35<00:46, 1.21it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 43% 41/96 
[00:36<00:44, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 44% 42/96 [00:37<00:43, 1.25it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 45% 43/96 [00:38<00:42, 1.25it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 46% 44/96 [00:38<00:41, 1.26it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 47% 45/96 [00:39<00:40, 1.25it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 48% 46/96 [00:40<00:40, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 49% 47/96 [00:41<00:39, 1.24it/s]\n", - "Compressing /encoder/layer.2/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 52%|▌| 50/96 [01:44<01:40, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 53%|▌| 51/96 [01:46<01:37, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 54%|▌| 52/96 [01:48<01:36, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 55%|▌| 53/96 [01:51<01:36, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 56%|▌| 54/96 [01:53<01:34, 2\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/value/MatMul: 21%|▏| 5/24 [00:09<00\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 57%|▌| 55/96 [01:55<01:32, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 58%|▌| 56/96 [01:58<01:31, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 59%|▌| 57/96 [02:00<01:31, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 60%|▌| 58/96 [02:03<01:30, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 61%|▌| 59/96 [02:05<01:25, 2\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/value/MatMul: 46%|▍| 11/24 [00:21<0\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 62%|▋| 60/96 [02:07<01:22, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 64%|▋| 61/96 [02:09<01:19, 2\u001b[A\n", + 
"Compressing /encoder/layer.2/output/dense/MatMul: 65%|▋| 62/96 [02:12<01:18, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 66%|▋| 63/96 [02:14<01:13, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 67%|▋| 64/96 [02:16<01:09, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 68%|▋| 65/96 [02:18<01:04, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 69%|▋| 66/96 [02:20<01:03, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 70%|▋| 67/96 [02:22<01:03, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 72%|▋| 69/96 [02:26<00:53, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 73%|▋| 70/96 [02:27<00:50, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 74%|▋| 71/96 [02:30<00:49, 1\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 75%|▊| 72/96 [02:32<00:50, 2\u001b[A\n", + "Compressing /encoder/layer.3/attention/self/value/MatMul: 100%|█| 24/24 [00:48<0\u001b[A\n", "\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 50% 48/96 [00:42<00:39, 1.22it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 51% 49/96 [00:42<00:39, 1.20it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 52% 50/96 [00:43<00:38, 1.19it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 53% 51/96 [00:44<00:40, 1.11it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 54% 52/96 [00:45<00:41, 1.06it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 55% 53/96 [00:46<00:42, 1.02it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 56% 54/96 [00:47<00:42, 1.00s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 57% 55/96 [00:49<00:41, 1.02s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 58% 56/96 [00:50<00:40, 1.02s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 59% 57/96 [00:50<00:38, 1.02it/s]\n", - "Compressing 
/encoder/layer.1/output/dense/MatMul: 60% 58/96 [00:51<00:35, 1.08it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 61% 59/96 [00:52<00:32, 1.12it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 62% 60/96 [00:53<00:31, 1.16it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 64% 61/96 [00:54<00:29, 1.18it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 65% 62/96 [00:54<00:28, 1.20it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 66% 63/96 [00:55<00:27, 1.21it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 67% 64/96 [00:56<00:26, 1.22it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 68% 65/96 [00:57<00:25, 1.23it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 69% 66/96 [00:58<00:24, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 70% 67/96 [00:58<00:23, 1.25it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 71% 68/96 [01:00<00:25, 1.12it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 72% 69/96 [01:00<00:23, 1.14it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 73% 70/96 [01:01<00:22, 1.15it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 74% 71/96 [01:02<00:21, 1.16it/s]\n", - "Compressing /encoder/layer.2/attention/self/query/MatMul: 100% 24/24 [00:21<00:00, 1.12it/s]\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 76%|▊| 73/96 [02:34<00:49, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 77%|▊| 74/96 [02:37<00:48, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 78%|▊| 75/96 [02:39<00:48, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 79%|▊| 76/96 [02:41<00:44, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 80%|▊| 77/96 [02:43<00:42, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 81%|▊| 78/96 [02:45<00:38, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 82%|▊| 79/96 
[02:48<00:37, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 83%|▊| 80/96 [02:50<00:36, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 84%|▊| 81/96 [02:52<00:31, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 85%|▊| 82/96 [02:54<00:29, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 86%|▊| 83/96 [02:57<00:30, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 88%|▉| 84/96 [03:00<00:30, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 89%|▉| 85/96 [03:02<00:27, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 90%|▉| 86/96 [03:05<00:24, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 91%|▉| 87/96 [03:07<00:21, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 93%|▉| 89/96 [03:11<00:15, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 94%|▉| 90/96 [03:13<00:13, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 95%|▉| 91/96 [03:16<00:11, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 96%|▉| 92/96 [03:18<00:09, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 97%|▉| 93/96 [03:20<00:06, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 98%|▉| 94/96 [03:22<00:04, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 99%|▉| 95/96 [03:25<00:02, 2\u001b[A\n", + "Compressing /encoder/layer.2/output/dense/MatMul: 100%|█| 96/96 [03:27<00:00, 2\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 0%| | 0/24 [00:00, ?\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 4%| | 1/24 [00:02<00:5\u001b[A\n", + "Compressing /encoder/layer.3/attention/output/dense/MatMul: 100%|█| 24/24 [00:59\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 8%| | 2/24 [00:04<00:5\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 12%|▏| 3/24 [00:06<00:4\u001b[A\n", + 
"Compressing /encoder/layer.3/intermediate/dense/MatMul: 17%|▏| 4/24 [00:08<00:4\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 21%|▏| 5/24 [00:11<00:4\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 25%|▎| 6/24 [00:13<00:4\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 29%|▎| 7/24 [00:15<00:3\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 33%|▎| 8/24 [00:18<00:3\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 38%|▍| 9/24 [00:20<00:3\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 42%|▍| 10/24 [00:22<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 50%|▌| 12/24 [00:27<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 54%|▌| 13/24 [00:29<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 58%|▌| 14/24 [00:31<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 62%|▋| 15/24 [00:34<00:\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 12%|▏| 12/96 [00:29<03:22, 2\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 71%|▋| 17/24 [00:38<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 75%|▊| 18/24 [00:40<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 79%|▊| 19/24 [00:43<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 83%|▊| 20/24 [00:45<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 88%|▉| 21/24 [00:48<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 92%|▉| 22/24 [00:50<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 96%|▉| 23/24 [00:52<00:\u001b[A\n", + "Compressing /encoder/layer.3/intermediate/dense/MatMul: 100%|█| 24/24 [00:55<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 0%| | 0/24 [00:00, ?\n", + "Compressing 
/encoder/layer.4/attention/self/key/MatMul: 4%| | 1/24 [00:02<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 8%| | 2/24 [00:04<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 12%|▏| 3/24 [00:06<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 17%|▏| 4/24 [00:08<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 21%|▏| 5/24 [00:11<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 25%|▎| 6/24 [00:13<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 29%|▎| 7/24 [00:15<00:3\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 33%|▎| 8/24 [00:18<00:3\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 38%|▍| 9/24 [00:21<00:3\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 42%|▍| 10/24 [00:23<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 46%|▍| 11/24 [00:25<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 50%|▌| 12/24 [00:28<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 54%|▌| 13/24 [00:30<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 62%|▋| 15/24 [00:35<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 67%|▋| 16/24 [00:37<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 71%|▋| 17/24 [00:39<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 75%|▊| 18/24 [00:42<00:\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 40%|▍| 38/96 [01:32<02:18, 2\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 79%|▊| 19/24 [00:44<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 83%|▊| 20/24 [00:47<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 88%|▉| 21/24 [00:49<00:\u001b[A\n", + "Compressing 
/encoder/layer.4/attention/self/key/MatMul: 92%|▉| 22/24 [00:52<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 96%|▉| 23/24 [00:54<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/key/MatMul: 100%|█| 24/24 [00:57<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 4%| | 1/24 [00:02<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 8%| | 2/24 [00:04<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 12%|▏| 3/24 [00:06<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 17%|▏| 4/24 [00:09<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 21%|▏| 5/24 [00:11<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 25%|▎| 6/24 [00:13<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 29%|▎| 7/24 [00:16<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 33%|▎| 8/24 [00:19<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 38%|▍| 9/24 [00:21<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 42%|▍| 10/24 [00:24<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 46%|▍| 11/24 [00:26<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 50%|▌| 12/24 [00:28<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 54%|▌| 13/24 [00:31<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 58%|▌| 14/24 [00:33<0\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 61%|▌| 59/96 [02:23<01:28, 2\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 62%|▋| 15/24 [00:37<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 71%|▋| 17/24 [00:41<0\u001b[A\n", + "Compressing 
/encoder/layer.4/attention/self/query/MatMul: 75%|▊| 18/24 [00:43<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 79%|▊| 19/24 [00:46<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 83%|▊| 20/24 [00:48<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 88%|▉| 21/24 [00:51<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 92%|▉| 22/24 [00:53<0\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 70%|▋| 67/96 [02:42<01:07, 2\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/query/MatMul: 100%|█| 24/24 [00:58<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 4%| | 1/24 [00:02<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 8%| | 2/24 [00:04<00\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 74%|▋| 71/96 [02:52<01:03, 2\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 17%|▏| 4/24 [00:09<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 21%|▏| 5/24 [00:11<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 25%|▎| 6/24 [00:14<00\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 78%|▊| 75/96 [03:01<00:48, 2\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 29%|▎| 7/24 [00:16<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 33%|▎| 8/24 [00:19<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 38%|▍| 9/24 [00:21<00\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 42%|▍| 10/24 [00:23<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 46%|▍| 11/24 [00:25<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 50%|▌| 12/24 [00:27<0\u001b[A\n", + "Compressing 
/encoder/layer.4/attention/self/value/MatMul: 58%|▌| 14/24 [00:31<0\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 86%|▊| 83/96 [03:18<00:28, 2\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 62%|▋| 15/24 [00:34<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 67%|▋| 16/24 [00:36<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 71%|▋| 17/24 [00:39<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 75%|▊| 18/24 [00:41<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 79%|▊| 19/24 [00:43<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 83%|▊| 20/24 [00:45<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 88%|▉| 21/24 [00:47<0\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 95%|▉| 91/96 [03:35<00:09, 1\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 92%|▉| 22/24 [00:50<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 96%|▉| 23/24 [00:52<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/self/value/MatMul: 100%|█| 24/24 [00:54<0\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 0%| | 0/24 [00:00<\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 4%| | 1/24 [00:02<\u001b[A\n", + "Compressing /encoder/layer.3/output/dense/MatMul: 100%|█| 96/96 [03:46<00:00, 2\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 8%| | 2/24 [00:03<\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 12%|▏| 3/24 [00:06<\u001b[A\n", + "Compressing /encoder/layer.4/intermediate/dense/MatMul: 4%| | 1/24 [00:02<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 17%|▏| 4/24 [00:08<\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 21%|▏| 5/24 [00:10<\u001b[A\n", + "Compressing 
/encoder/layer.4/attention/output/dense/MatMul: 29%|▎| 7/24 [00:14<\u001b[A\n", + "Compressing /encoder/layer.4/intermediate/dense/MatMul: 21%|▏| 5/24 [00:10<00:4\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 33%|▎| 8/24 [00:16<\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 38%|▍| 9/24 [00:19<\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 42%|▍| 10/24 [00:21\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 46%|▍| 11/24 [00:23\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 50%|▌| 12/24 [00:25\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 54%|▌| 13/24 [00:27\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 58%|▌| 14/24 [00:29\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 62%|▋| 15/24 [00:31\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 67%|▋| 16/24 [00:33\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 71%|▋| 17/24 [00:36\u001b[A\n", + "Compressing /encoder/layer.4/intermediate/dense/MatMul: 67%|▋| 16/24 [00:32<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 79%|▊| 19/24 [00:41\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 83%|▊| 20/24 [00:42\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 88%|▉| 21/24 [00:44\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 92%|▉| 22/24 [00:47\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 96%|▉| 23/24 [00:49\u001b[A\n", + "Compressing /encoder/layer.4/intermediate/dense/MatMul: 92%|▉| 22/24 [00:44<00:\u001b[A\n", + "Compressing /encoder/layer.4/attention/output/dense/MatMul: 100%|█| 24/24 [00:51\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 0%| | 0/96 [00:00, ?it/s]\n", + "Compressing 
/encoder/layer.4/intermediate/dense/MatMul: 100%|█| 24/24 [00:49<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 75% 72/96 [01:03<00:20, 1.15it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 76% 73/96 [01:04<00:20, 1.11it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 77% 74/96 [01:05<00:20, 1.05it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 78% 75/96 [01:06<00:20, 1.02it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 79% 76/96 [01:07<00:20, 1.00s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 80% 77/96 [01:08<00:19, 1.02s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 81% 78/96 [01:09<00:18, 1.03s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 82% 79/96 [01:10<00:17, 1.03s/it]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 83% 80/96 [01:11<00:15, 1.05it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 84% 81/96 [01:12<00:13, 1.10it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 85% 82/96 [01:13<00:12, 1.14it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 86% 83/96 [01:13<00:11, 1.17it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 88% 84/96 [01:14<00:09, 1.20it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 89% 85/96 [01:15<00:08, 1.23it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 90% 86/96 [01:16<00:08, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 91% 87/96 [01:17<00:07, 1.25it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 92% 88/96 [01:17<00:06, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 93% 89/96 [01:18<00:05, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 94% 90/96 [01:19<00:04, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 95% 91/96 [01:20<00:04, 1.24it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 96% 92/96 [01:21<00:03, 1.21it/s]\n", - 
"Compressing /encoder/layer.1/output/dense/MatMul: 97% 93/96 [01:22<00:02, 1.20it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 98% 94/96 [01:22<00:01, 1.19it/s]\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 99% 95/96 [01:23<00:00, 1.19it/s]\n", - "Compressing /encoder/layer.2/attention/self/value/MatMul: 100% 24/24 [00:21<00:00, 1.13it/s]\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 1%| | 1/96 [00:02<03:35, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 2%| | 2/96 [00:04<03:38, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 3%| | 3/96 [00:06<03:31, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 4%| | 4/96 [00:08<03:20, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 5%| | 5/96 [00:10<03:14, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 6%| | 6/96 [00:13<03:17, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 7%| | 7/96 [00:15<03:20, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 8%| | 8/96 [00:17<03:17, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 9%| | 9/96 [00:19<03:06, 2.\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 10%| | 10/96 [00:21<03:04, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 11%| | 11/96 [00:24<03:12, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 12%|▏| 12/96 [00:26<03:09, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 14%|▏| 13/96 [00:29<03:20, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 15%|▏| 14/96 [00:31<03:20, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/self/key/MatMul: 58%|▌| 14/24 [00:30<00:\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 16%|▏| 15/96 [00:33<03:07, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 17%|▏| 16/96 [00:35<02:56, 2\u001b[A\n", + "Compressing 
/encoder/layer.4/output/dense/MatMul: 19%|▏| 18/96 [00:39<02:40, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 20%|▏| 19/96 [00:42<02:43, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 21%|▏| 20/96 [00:43<02:34, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 22%|▏| 21/96 [00:45<02:30, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 23%|▏| 22/96 [00:47<02:28, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 24%|▏| 23/96 [00:49<02:26, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 25%|▎| 24/96 [00:52<02:29, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/self/key/MatMul: 100%|█| 24/24 [00:50<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.1/output/dense/MatMul: 100% 96/96 [01:24<00:00, 1.13it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 4% 1/24 [00:01<00:24, 1.08s/it]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 8% 2/24 [00:02<00:23, 1.06s/it]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 12% 3/24 [00:03<00:22, 1.06s/it]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 17% 4/24 [00:04<00:20, 1.05s/it]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 21% 5/24 [00:05<00:19, 1.04s/it]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 25% 6/24 [00:06<00:18, 1.02s/it]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 29% 7/24 [00:07<00:16, 1.06it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 33% 8/24 [00:07<00:14, 1.11it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 38% 9/24 [00:08<00:12, 1.16it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 42% 10/24 [00:09<00:11, 1.19it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 46% 11/24 [00:10<00:10, 1.21it/s]\n", - "Compressing 
/encoder/layer.2/intermediate/dense/MatMul: 50% 12/24 [00:10<00:09, 1.22it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 54% 13/24 [00:11<00:08, 1.23it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 58% 14/24 [00:12<00:08, 1.24it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 62% 15/24 [00:13<00:07, 1.24it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 67% 16/24 [00:14<00:06, 1.25it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 71% 17/24 [00:14<00:05, 1.25it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 75% 18/24 [00:15<00:04, 1.25it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 79% 19/24 [00:16<00:04, 1.22it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 83% 20/24 [00:17<00:03, 1.18it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 88% 21/24 [00:18<00:02, 1.18it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 92% 22/24 [00:19<00:01, 1.16it/s]\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 96% 23/24 [00:20<00:00, 1.11it/s]\n", - "Compressing /encoder/layer.2/attention/output/dense/MatMul: 100% 24/24 [00:21<00:00, 1.12it/s]\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 26%|▎| 25/96 [00:54<02:31, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 27%|▎| 26/96 [00:56<02:25, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 28%|▎| 27/96 [00:58<02:22, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 29%|▎| 28/96 [01:00<02:18, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 30%|▎| 29/96 [01:02<02:23, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 31%|▎| 30/96 [01:05<02:31, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 32%|▎| 31/96 [01:07<02:31, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 33%|▎| 32/96 [01:10<02:32, 
2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 34%|▎| 33/96 [01:12<02:28, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 35%|▎| 34/96 [01:14<02:21, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 36%|▎| 35/96 [01:16<02:16, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 38%|▍| 36/96 [01:19<02:16, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 39%|▍| 37/96 [01:21<02:16, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/self/query/MatMul: 54%|▌| 13/24 [00:28<0\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 40%|▍| 38/96 [01:23<02:14, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 41%|▍| 39/96 [01:26<02:16, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 42%|▍| 40/96 [01:28<02:13, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 43%|▍| 41/96 [01:30<02:07, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 44%|▍| 42/96 [01:33<02:02, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 45%|▍| 43/96 [01:35<01:59, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 46%|▍| 44/96 [01:37<01:56, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 47%|▍| 45/96 [01:40<02:01, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 48%|▍| 46/96 [01:42<02:01, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 49%|▍| 47/96 [01:45<01:56, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/self/query/MatMul: 100%|█| 24/24 [00:52<0\u001b[A\n", "\n", - "Compressing /encoder/layer.2/intermediate/dense/MatMul: 100% 24/24 [00:21<00:00, 1.12it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 4% 1/24 [00:01<00:24, 1.07s/it]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 8% 2/24 [00:02<00:23, 1.07s/it]\n", - 
"Compressing /encoder/layer.3/attention/self/key/MatMul: 12% 3/24 [00:03<00:22, 1.06s/it]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 17% 4/24 [00:04<00:20, 1.04s/it]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 21% 5/24 [00:05<00:19, 1.02s/it]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 25% 6/24 [00:06<00:17, 1.05it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 29% 7/24 [00:06<00:15, 1.11it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 33% 8/24 [00:07<00:13, 1.15it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 38% 9/24 [00:08<00:12, 1.18it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 42% 10/24 [00:09<00:11, 1.18it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 46% 11/24 [00:10<00:10, 1.21it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 50% 12/24 [00:10<00:09, 1.24it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 54% 13/24 [00:11<00:08, 1.25it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 58% 14/24 [00:12<00:08, 1.24it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 62% 15/24 [00:13<00:07, 1.24it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 67% 16/24 [00:13<00:06, 1.25it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 71% 17/24 [00:14<00:05, 1.26it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 75% 18/24 [00:15<00:04, 1.24it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 79% 19/24 [00:16<00:04, 1.22it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 83% 20/24 [00:17<00:03, 1.19it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 88% 21/24 [00:18<00:02, 1.19it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 92% 22/24 [00:19<00:01, 1.13it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 96% 23/24 
[00:20<00:00, 1.07it/s]\n", - "Compressing /encoder/layer.3/attention/self/key/MatMul: 100% 24/24 [00:21<00:00, 1.13it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 4% 1/24 [00:01<00:24, 1.06s/it]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 8% 2/24 [00:02<00:22, 1.04s/it]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 12% 3/24 [00:03<00:21, 1.03s/it]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 17% 4/24 [00:04<00:20, 1.01s/it]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 21% 5/24 [00:04<00:17, 1.08it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 25% 6/24 [00:05<00:15, 1.14it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 29% 7/24 [00:06<00:14, 1.15it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 33% 8/24 [00:07<00:13, 1.18it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 38% 9/24 [00:08<00:12, 1.19it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 42% 10/24 [00:08<00:11, 1.21it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 46% 11/24 [00:09<00:10, 1.23it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 50% 12/24 [00:10<00:09, 1.24it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 54% 13/24 [00:11<00:08, 1.25it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 58% 14/24 [00:12<00:07, 1.25it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 62% 15/24 [00:12<00:07, 1.26it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 67% 16/24 [00:13<00:06, 1.26it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 71% 17/24 [00:14<00:05, 1.24it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 75% 18/24 [00:15<00:04, 1.22it/s]\n", - "Compressing 
/encoder/layer.3/attention/self/query/MatMul: 79% 19/24 [00:16<00:04, 1.21it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 83% 20/24 [00:17<00:03, 1.20it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 88% 21/24 [00:17<00:02, 1.14it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 92% 22/24 [00:19<00:01, 1.08it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 96% 23/24 [00:20<00:00, 1.04it/s]\n", - "Compressing /encoder/layer.3/attention/self/query/MatMul: 100% 24/24 [00:21<00:00, 1.14it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 4% 1/24 [00:01<00:23, 1.01s/it]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 8% 2/24 [00:02<00:22, 1.03s/it]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 12% 3/24 [00:03<00:20, 1.00it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 17% 4/24 [00:03<00:18, 1.09it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 21% 5/24 [00:04<00:16, 1.14it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 25% 6/24 [00:05<00:15, 1.18it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 29% 7/24 [00:06<00:14, 1.20it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 33% 8/24 [00:07<00:13, 1.21it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 38% 9/24 [00:07<00:12, 1.22it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 42% 10/24 [00:08<00:11, 1.25it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 46% 11/24 [00:09<00:10, 1.27it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 50% 12/24 [00:10<00:09, 1.26it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 54% 13/24 [00:10<00:08, 1.26it/s]\n", - "Compressing 
/encoder/layer.3/attention/self/value/MatMul: 58% 14/24 [00:11<00:07, 1.27it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 62% 15/24 [00:12<00:07, 1.27it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 67% 16/24 [00:13<00:06, 1.25it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 71% 17/24 [00:14<00:05, 1.23it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 75% 18/24 [00:15<00:05, 1.17it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 79% 19/24 [00:15<00:04, 1.17it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 83% 20/24 [00:16<00:03, 1.12it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 88% 21/24 [00:17<00:02, 1.07it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 92% 22/24 [00:18<00:01, 1.05it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 96% 23/24 [00:20<00:00, 1.02it/s]\n", - "Compressing /encoder/layer.3/attention/self/value/MatMul: 100% 24/24 [00:21<00:00, 1.14it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 4% 1/24 [00:01<00:23, 1.00s/it]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 8% 2/24 [00:01<00:21, 1.01it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 12% 3/24 [00:02<00:18, 1.11it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 17% 4/24 [00:03<00:17, 1.17it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 21% 5/24 [00:04<00:15, 1.21it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 25% 6/24 [00:05<00:14, 1.23it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 29% 7/24 [00:05<00:13, 1.23it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 33% 8/24 [00:06<00:12, 1.25it/s]\n", - "Compressing 
/encoder/layer.3/attention/output/dense/MatMul: 38% 9/24 [00:07<00:11, 1.25it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 42% 10/24 [00:08<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 46% 11/24 [00:09<00:10, 1.28it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 50% 12/24 [00:09<00:09, 1.28it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 54% 13/24 [00:10<00:08, 1.28it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 58% 14/24 [00:11<00:07, 1.29it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 62% 15/24 [00:12<00:07, 1.26it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 67% 16/24 [00:13<00:06, 1.22it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 71% 17/24 [00:13<00:05, 1.20it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 75% 18/24 [00:14<00:04, 1.20it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 79% 19/24 [00:15<00:04, 1.17it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 83% 20/24 [00:16<00:03, 1.10it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 88% 21/24 [00:17<00:02, 1.05it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 92% 22/24 [00:18<00:01, 1.02it/s]\n", - "Compressing /encoder/layer.3/attention/output/dense/MatMul: 96% 23/24 [00:19<00:01, 1.00s/it]\n", - "Compressing /encoder/layer.2/output/dense/MatMul: 100% 96/96 [01:24<00:00, 1.14it/s]\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 50%|▌| 48/96 [01:47<01:53, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 51%|▌| 49/96 [01:49<01:48, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 52%|▌| 50/96 [01:51<01:45, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 53%|▌| 51/96 [01:54<01:43, 2\u001b[A\n", + "Compressing 
/encoder/layer.4/output/dense/MatMul: 54%|▌| 52/96 [01:56<01:41, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 55%|▌| 53/96 [01:58<01:38, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 56%|▌| 54/96 [02:00<01:34, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 57%|▌| 55/96 [02:02<01:30, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 58%|▌| 56/96 [02:04<01:23, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 59%|▌| 57/96 [02:06<01:21, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 60%|▌| 58/96 [02:09<01:21, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 61%|▌| 59/96 [02:11<01:21, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/self/value/MatMul: 50%|▌| 12/24 [00:25<0\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 62%|▋| 60/96 [02:13<01:17, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 64%|▋| 61/96 [02:15<01:14, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 65%|▋| 62/96 [02:17<01:10, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 66%|▋| 63/96 [02:19<01:07, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 67%|▋| 64/96 [02:21<01:07, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 68%|▋| 65/96 [02:23<01:04, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 69%|▋| 66/96 [02:25<01:02, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 70%|▋| 67/96 [02:27<00:59, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 71%|▋| 68/96 [02:29<00:57, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 72%|▋| 69/96 [02:31<00:54, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 73%|▋| 70/96 [02:33<00:52, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/self/value/MatMul: 100%|█| 24/24 [00:48<0\u001b[A\n", "\n", - "Compressing 
/encoder/layer.3/attention/output/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 0% 0/96 [00:00, ?it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 1% 1/96 [00:01<01:37, 1.03s/it]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 2% 2/96 [00:01<01:23, 1.12it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 3% 3/96 [00:02<01:18, 1.19it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 4% 4/96 [00:03<01:15, 1.22it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 5% 5/96 [00:04<01:13, 1.24it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 6% 6/96 [00:04<01:12, 1.24it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 7% 7/96 [00:05<01:11, 1.25it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 8% 8/96 [00:06<01:09, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 9% 9/96 [00:07<01:08, 1.27it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 10% 10/96 [00:08<01:08, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 11% 11/96 [00:08<01:07, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 12% 12/96 [00:09<01:07, 1.25it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 14% 13/96 [00:10<01:05, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 15% 14/96 [00:11<01:05, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 16% 15/96 [00:12<01:05, 1.24it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 17% 16/96 [00:12<01:05, 1.22it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 18% 17/96 [00:13<01:04, 1.22it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 19% 18/96 [00:14<01:05, 1.19it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 20% 19/96 [00:15<01:08, 1.12it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 21% 20/96 [00:16<01:10, 1.07it/s]\n", - "Compressing 
/encoder/layer.3/output/dense/MatMul: 22% 21/96 [00:17<01:11, 1.05it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 23% 22/96 [00:18<01:12, 1.02it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 24% 23/96 [00:19<01:12, 1.00it/s]\n", - "Compressing /encoder/layer.3/intermediate/dense/MatMul: 100% 24/24 [00:20<00:00, 1.14it/s]\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 74%|▋| 71/96 [02:35<00:50, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 75%|▊| 72/96 [02:38<00:50, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 76%|▊| 73/96 [02:40<00:52, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 77%|▊| 74/96 [02:43<00:49, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 78%|▊| 75/96 [02:45<00:45, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 79%|▊| 76/96 [02:47<00:45, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 80%|▊| 77/96 [02:49<00:41, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 81%|▊| 78/96 [02:51<00:39, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 82%|▊| 79/96 [02:54<00:38, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 83%|▊| 80/96 [02:56<00:35, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 84%|▊| 81/96 [02:58<00:35, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/output/dense/MatMul: 46%|▍| 11/24 [00:23\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 86%|▊| 83/96 [03:03<00:28, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 88%|▉| 84/96 [03:05<00:26, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 89%|▉| 85/96 [03:07<00:24, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/output/dense/MatMul: 62%|▋| 15/24 [00:32\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 90%|▉| 86/96 [03:10<00:22, 2\u001b[A\n", + "Compressing 
/encoder/layer.4/output/dense/MatMul: 91%|▉| 87/96 [03:12<00:19, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 92%|▉| 88/96 [03:13<00:16, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 93%|▉| 89/96 [03:16<00:15, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 94%|▉| 90/96 [03:18<00:12, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 95%|▉| 91/96 [03:20<00:10, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 96%|▉| 92/96 [03:22<00:08, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 97%|▉| 93/96 [03:25<00:06, 2\u001b[A\n", + "Compressing /encoder/layer.5/attention/output/dense/MatMul: 100%|█| 24/24 [00:50\u001b[A\n", "\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 25% 24/96 [00:20<01:11, 1.00it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 26% 25/96 [00:21<01:08, 1.04it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 27% 26/96 [00:22<01:04, 1.09it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 28% 27/96 [00:23<01:00, 1.13it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 29% 28/96 [00:24<00:57, 1.18it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 30% 29/96 [00:24<00:55, 1.20it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 31% 30/96 [00:25<00:53, 1.23it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 32% 31/96 [00:26<00:52, 1.24it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 33% 32/96 [00:27<00:51, 1.25it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 34% 33/96 [00:28<00:49, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 35% 34/96 [00:28<00:49, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 36% 35/96 [00:29<00:48, 1.27it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 38% 36/96 [00:30<00:46, 1.28it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 39% 37/96 
[00:31<00:46, 1.28it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 40% 38/96 [00:31<00:46, 1.25it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 41% 39/96 [00:32<00:46, 1.23it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 42% 40/96 [00:33<00:46, 1.22it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 43% 41/96 [00:34<00:46, 1.19it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 44% 42/96 [00:35<00:47, 1.14it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 45% 43/96 [00:36<00:49, 1.08it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 46% 44/96 [00:37<00:49, 1.05it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 47% 45/96 [00:38<00:50, 1.02it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 48% 46/96 [00:39<00:49, 1.01it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 49% 47/96 [00:40<00:48, 1.00it/s]\n", - "Compressing /encoder/layer.4/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 98%|▉| 94/96 [03:27<00:04, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 99%|▉| 95/96 [03:29<00:02, 2\u001b[A\n", + "Compressing /encoder/layer.4/output/dense/MatMul: 100%|█| 96/96 [03:31<00:00, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 0%| | 0/96 [00:00, ?it/s]\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 1%| | 1/96 [00:02<03:22, 2.\u001b[A\n", + "Compressing /encoder/layer.5/intermediate/dense/MatMul: 17%|▏| 4/24 [00:08<00:4\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 2%| | 2/96 [00:04<03:45, 2.\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 3%| | 3/96 [00:07<03:53, 2.\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 4%| | 4/96 [00:09<03:32, 2.\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 5%| | 5/96 [00:11<03:19, 2.\u001b[A\n", + "Compressing 
/encoder/layer.5/output/dense/MatMul: 6%| | 6/96 [00:13<03:14, 2.\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 7%| | 7/96 [00:15<03:11, 2.\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 8%| | 8/96 [00:17<03:08, 2.\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 9%| | 9/96 [00:20<03:15, 2.\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 10%| | 10/96 [00:22<03:20, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 11%| | 11/96 [00:25<03:22, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 12%|▏| 12/96 [00:27<03:22, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 14%|▏| 13/96 [00:29<03:17, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 15%|▏| 14/96 [00:31<03:02, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 16%|▏| 15/96 [00:33<02:54, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 17%|▏| 16/96 [00:36<03:00, 2\u001b[A\n", + "Compressing /encoder/layer.5/intermediate/dense/MatMul: 83%|▊| 20/24 [00:43<00:\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 19%|▏| 18/96 [00:40<02:53, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 20%|▏| 19/96 [00:42<02:46, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 21%|▏| 20/96 [00:44<02:39, 2\u001b[A\n", + "Compressing /encoder/layer.5/intermediate/dense/MatMul: 100%|█| 24/24 [00:51<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 50% 48/96 [00:41<00:47, 1.00it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 51% 49/96 [00:42<00:44, 1.06it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 52% 50/96 [00:43<00:41, 1.10it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 53% 51/96 [00:44<00:38, 1.16it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 54% 52/96 [00:44<00:37, 1.18it/s]\n", - "Compressing 
/encoder/layer.3/output/dense/MatMul: 55% 53/96 [00:45<00:35, 1.21it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 56% 54/96 [00:46<00:34, 1.22it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 57% 55/96 [00:47<00:33, 1.21it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 58% 56/96 [00:48<00:32, 1.23it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 59% 57/96 [00:48<00:31, 1.24it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 60% 58/96 [00:49<00:30, 1.24it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 61% 59/96 [00:50<00:29, 1.24it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 62% 60/96 [00:51<00:28, 1.25it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 64% 61/96 [00:52<00:29, 1.17it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 65% 62/96 [00:53<00:29, 1.17it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 66% 63/96 [00:53<00:28, 1.17it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 67% 64/96 [00:54<00:27, 1.18it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 68% 65/96 [00:55<00:26, 1.17it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 69% 66/96 [00:56<00:27, 1.09it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 70% 67/96 [00:57<00:27, 1.04it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 71% 68/96 [00:58<00:27, 1.02it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 72% 69/96 [00:59<00:26, 1.01it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 73% 70/96 [01:00<00:25, 1.00it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 74% 71/96 [01:01<00:25, 1.02s/it]\n", - "Compressing /encoder/layer.4/attention/self/query/MatMul: 100% 24/24 [00:21<00:00, 1.13it/s]\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 22%|▏| 21/96 [00:47<02:40, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 23%|▏| 22/96 [00:49<02:49, 
2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 24%|▏| 23/96 [00:52<02:50, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/key/MatMul: 12%|▏| 3/24 [00:06<00:4\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 25%|▎| 24/96 [00:54<02:48, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 27%|▎| 26/96 [00:58<02:34, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 28%|▎| 27/96 [01:00<02:20, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 29%|▎| 28/96 [01:02<02:14, 1\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 30%|▎| 29/96 [01:04<02:13, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 31%|▎| 30/96 [01:05<02:05, 1\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 33%|▎| 32/96 [01:09<01:57, 1\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 34%|▎| 33/96 [01:11<02:03, 1\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 35%|▎| 34/96 [01:13<02:09, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/key/MatMul: 54%|▌| 13/24 [00:27<00:\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 36%|▎| 35/96 [01:16<02:09, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 39%|▍| 37/96 [01:20<02:02, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 40%|▍| 38/96 [01:22<02:02, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 41%|▍| 39/96 [01:24<01:59, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 42%|▍| 40/96 [01:26<01:53, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 43%|▍| 41/96 [01:28<01:50, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 44%|▍| 42/96 [01:30<01:46, 1\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 45%|▍| 43/96 [01:32<01:47, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 46%|▍| 44/96 [01:34<01:47, 2\u001b[A\n", + 
"Compressing /encoder/layer.5/output/dense/MatMul: 47%|▍| 45/96 [01:36<01:44, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/key/MatMul: 100%|█| 24/24 [00:52<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 75% 72/96 [01:02<00:23, 1.02it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 76% 73/96 [01:03<00:21, 1.07it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 77% 74/96 [01:04<00:19, 1.11it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 78% 75/96 [01:05<00:18, 1.15it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 79% 76/96 [01:06<00:16, 1.18it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 80% 77/96 [01:06<00:15, 1.20it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 81% 78/96 [01:07<00:14, 1.21it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 82% 79/96 [01:08<00:13, 1.22it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 83% 80/96 [01:09<00:13, 1.23it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 84% 81/96 [01:09<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 85% 82/96 [01:10<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 86% 83/96 [01:11<00:10, 1.27it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 88% 84/96 [01:12<00:09, 1.25it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 89% 85/96 [01:13<00:08, 1.22it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 90% 86/96 [01:14<00:08, 1.21it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 91% 87/96 [01:14<00:07, 1.20it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 92% 88/96 [01:15<00:06, 1.20it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 93% 89/96 [01:16<00:06, 1.11it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 94% 90/96 [01:17<00:05, 1.06it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 95% 91/96 [01:18<00:04, 
1.04it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 96% 92/96 [01:19<00:03, 1.02it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 97% 93/96 [01:20<00:02, 1.01it/s]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 98% 94/96 [01:21<00:02, 1.01s/it]\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 99% 95/96 [01:22<00:00, 1.01it/s]\n", - "Compressing /encoder/layer.4/attention/self/value/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 48%|▍| 46/96 [01:38<01:46, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 49%|▍| 47/96 [01:41<01:51, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 50%|▌| 48/96 [01:44<01:54, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 51%|▌| 49/96 [01:46<01:52, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 52%|▌| 50/96 [01:48<01:49, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 53%|▌| 51/96 [01:51<01:49, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 54%|▌| 52/96 [01:53<01:45, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 55%|▌| 53/96 [01:56<01:43, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 56%|▌| 54/96 [01:58<01:39, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 57%|▌| 55/96 [02:00<01:38, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/query/MatMul: 42%|▍| 10/24 [00:22<0\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 58%|▌| 56/96 [02:03<01:41, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 59%|▌| 57/96 [02:06<01:38, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 60%|▌| 58/96 [02:08<01:33, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 61%|▌| 59/96 [02:10<01:28, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 62%|▋| 60/96 [02:12<01:21, 2\u001b[A\n", + "Compressing 
/encoder/layer.5/output/dense/MatMul: 65%|▋| 62/96 [02:16<01:13, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/query/MatMul: 71%|▋| 17/24 [00:38<0\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 66%|▋| 63/96 [02:19<01:13, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 67%|▋| 64/96 [02:21<01:10, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 68%|▋| 65/96 [02:23<01:07, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 69%|▋| 66/96 [02:25<01:06, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 70%|▋| 67/96 [02:27<01:02, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 71%|▋| 68/96 [02:29<00:59, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/query/MatMul: 100%|█| 24/24 [00:52<0\u001b[A\n", "\n", - "Compressing /encoder/layer.3/output/dense/MatMul: 100% 96/96 [01:23<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 4% 1/24 [00:00<00:18, 1.24it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 8% 2/24 [00:01<00:17, 1.24it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 12% 3/24 [00:02<00:16, 1.26it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 17% 4/24 [00:03<00:15, 1.25it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 21% 5/24 [00:03<00:15, 1.27it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 25% 6/24 [00:04<00:14, 1.26it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 29% 7/24 [00:05<00:13, 1.26it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 33% 8/24 [00:06<00:12, 1.26it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 38% 9/24 [00:07<00:11, 1.27it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 42% 10/24 [00:07<00:10, 1.27it/s]\n", - "Compressing 
/encoder/layer.4/intermediate/dense/MatMul: 46% 11/24 [00:08<00:10, 1.26it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 50% 12/24 [00:09<00:09, 1.21it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 54% 13/24 [00:10<00:09, 1.19it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 58% 14/24 [00:11<00:08, 1.18it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 62% 15/24 [00:12<00:07, 1.18it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 67% 16/24 [00:13<00:07, 1.12it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 71% 17/24 [00:14<00:06, 1.06it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 75% 18/24 [00:15<00:05, 1.01it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 79% 19/24 [00:16<00:05, 1.01s/it]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 83% 20/24 [00:17<00:04, 1.02s/it]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 88% 21/24 [00:18<00:03, 1.03s/it]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 92% 22/24 [00:19<00:01, 1.02it/s]\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 96% 23/24 [00:20<00:00, 1.08it/s]\n", - "Compressing /encoder/layer.4/attention/output/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 72%|▋| 69/96 [02:32<00:58, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 73%|▋| 70/96 [02:34<00:57, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 74%|▋| 71/96 [02:36<00:55, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 75%|▊| 72/96 [02:38<00:51, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 76%|▊| 73/96 [02:40<00:48, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 77%|▊| 74/96 [02:43<00:48, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 78%|▊| 75/96 [02:45<00:47, 
2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 79%|▊| 76/96 [02:48<00:47, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 80%|▊| 77/96 [02:50<00:45, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 81%|▊| 78/96 [02:52<00:42, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 82%|▊| 79/96 [02:54<00:38, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 83%|▊| 80/96 [02:56<00:34, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 84%|▊| 81/96 [02:58<00:31, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 85%|▊| 82/96 [03:01<00:32, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/value/MatMul: 58%|▌| 14/24 [00:30<0\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 86%|▊| 83/96 [03:04<00:31, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 89%|▉| 85/96 [03:08<00:24, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 90%|▉| 86/96 [03:10<00:21, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 91%|▉| 87/96 [03:12<00:19, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 92%|▉| 88/96 [03:14<00:17, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 93%|▉| 89/96 [03:17<00:15, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/value/MatMul: 88%|▉| 21/24 [00:45<0\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 94%|▉| 90/96 [03:19<00:13, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 95%|▉| 91/96 [03:21<00:11, 2\u001b[A\n", + "Compressing /encoder/layer.6/attention/self/value/MatMul: 100%|█| 24/24 [00:52<0\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 96%|▉| 92/96 [03:23<00:09, 2\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 97%|▉| 93/96 [03:25<00:06, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 98%|▉| 94/96 [03:27<00:04, 2\u001b[A\n", + "Compressing 
/encoder/layer.6/attention/output/dense/MatMul: 8%| | 2/24 [00:04<\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 99%|▉| 95/96 [03:30<00:02, 2\u001b[A\n", + "Compressing /encoder/layer.5/output/dense/MatMul: 100%|█| 96/96 [03:32<00:00, 2\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 0%| | 0/24 [00:00, ?\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 4%| | 1/24 [00:02<00:4\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 8%| | 2/24 [00:04<00:4\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 12%|▏| 3/24 [00:06<00:4\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 17%|▏| 4/24 [00:08<00:4\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 21%|▏| 5/24 [00:10<00:4\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 25%|▎| 6/24 [00:13<00:4\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 29%|▎| 7/24 [00:15<00:3\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 33%|▎| 8/24 [00:17<00:3\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 38%|▍| 9/24 [00:19<00:3\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 42%|▍| 10/24 [00:21<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 46%|▍| 11/24 [00:23<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 50%|▌| 12/24 [00:26<00:\u001b[A\n", + "Compressing /encoder/layer.6/attention/output/dense/MatMul: 71%|▋| 17/24 [00:35\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 54%|▌| 13/24 [00:28<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 58%|▌| 14/24 [00:31<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 62%|▋| 15/24 [00:33<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 67%|▋| 16/24 [00:35<00:\u001b[A\n", + "Compressing 
/encoder/layer.6/intermediate/dense/MatMul: 71%|▋| 17/24 [00:37<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 75%|▊| 18/24 [00:39<00:\u001b[A\n", + "Compressing /encoder/layer.6/attention/output/dense/MatMul: 100%|█| 24/24 [00:50\u001b[A\n", "\n", - "Compressing /encoder/layer.4/intermediate/dense/MatMul: 100% 24/24 [00:20<00:00, 1.14it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 4% 1/24 [00:00<00:18, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 8% 2/24 [00:01<00:17, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 12% 3/24 [00:02<00:16, 1.29it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 17% 4/24 [00:03<00:15, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 21% 5/24 [00:03<00:14, 1.29it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 25% 6/24 [00:04<00:14, 1.29it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 29% 7/24 [00:05<00:13, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 33% 8/24 [00:06<00:12, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 38% 9/24 [00:07<00:11, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 42% 10/24 [00:07<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 46% 11/24 [00:08<00:10, 1.23it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 50% 12/24 [00:09<00:09, 1.21it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 54% 13/24 [00:10<00:09, 1.21it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 58% 14/24 [00:11<00:08, 1.20it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 62% 15/24 [00:12<00:08, 1.11it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 67% 16/24 [00:13<00:07, 
1.06it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 71% 17/24 [00:14<00:06, 1.02it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 75% 18/24 [00:15<00:05, 1.01it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 79% 19/24 [00:16<00:05, 1.00s/it]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 83% 20/24 [00:17<00:04, 1.02s/it]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 88% 21/24 [00:18<00:02, 1.01it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 92% 22/24 [00:19<00:01, 1.07it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 96% 23/24 [00:20<00:00, 1.12it/s]\n", - "Compressing /encoder/layer.5/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 4% 1/24 [00:00<00:17, 1.29it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 8% 2/24 [00:01<00:17, 1.26it/s]\n", - "Compressing /encoder/layer.4/output/dense/MatMul: 28% 27/96 [00:23<00:57, 1.21it/s]\u001b[A\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 12% 3/24 [00:02<00:16, 1.24it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 17% 4/24 [00:03<00:16, 1.24it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 21% 5/24 [00:03<00:15, 1.26it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 29% 7/24 [00:05<00:13, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 33% 8/24 [00:06<00:12, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 38% 9/24 [00:07<00:11, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 42% 10/24 [00:07<00:11, 1.24it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 46% 11/24 [00:08<00:10, 1.20it/s]\n", - "Compressing 
/encoder/layer.5/attention/self/query/MatMul: 50% 12/24 [00:09<00:09, 1.20it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 54% 13/24 [00:10<00:09, 1.20it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 58% 14/24 [00:11<00:09, 1.11it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 62% 15/24 [00:12<00:08, 1.06it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 67% 16/24 [00:13<00:07, 1.03it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 71% 17/24 [00:14<00:06, 1.02it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 75% 18/24 [00:15<00:05, 1.01it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 79% 19/24 [00:16<00:05, 1.00s/it]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 83% 20/24 [00:17<00:03, 1.02it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 88% 21/24 [00:18<00:02, 1.08it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 92% 22/24 [00:19<00:01, 1.12it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 96% 23/24 [00:20<00:00, 1.17it/s]\n", - "Compressing /encoder/layer.5/attention/self/query/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 4% 1/24 [00:00<00:18, 1.23it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 8% 2/24 [00:01<00:17, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 12% 3/24 [00:02<00:16, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 17% 4/24 [00:03<00:15, 1.26it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 21% 5/24 [00:03<00:14, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 25% 6/24 [00:04<00:14, 1.27it/s]\n", - "Compressing 
/encoder/layer.5/attention/self/value/MatMul: 29% 7/24 [00:05<00:13, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 33% 8/24 [00:06<00:12, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 38% 9/24 [00:07<00:12, 1.22it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 42% 10/24 [00:08<00:11, 1.20it/s]\n", - "Compressing /encoder/layer.4/output/dense/MatMul: 61% 59/96 [00:50<00:30, 1.22it/s]\u001b[A\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 46% 11/24 [00:08<00:10, 1.19it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 50% 12/24 [00:09<00:10, 1.17it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 54% 13/24 [00:10<00:09, 1.13it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 58% 14/24 [00:11<00:09, 1.07it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 62% 15/24 [00:12<00:08, 1.03it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 67% 16/24 [00:13<00:07, 1.02it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 71% 17/24 [00:14<00:06, 1.00it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 75% 18/24 [00:15<00:06, 1.01s/it]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 79% 19/24 [00:16<00:05, 1.00s/it]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 83% 20/24 [00:17<00:03, 1.06it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 88% 21/24 [00:18<00:02, 1.11it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 92% 22/24 [00:19<00:01, 1.16it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 96% 23/24 [00:20<00:00, 1.19it/s]\n", - "Compressing /encoder/layer.5/attention/self/value/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing 
/encoder/layer.5/attention/output/dense/MatMul: 4% 1/24 [00:00<00:18, 1.25it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 8% 2/24 [00:01<00:17, 1.29it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 12% 3/24 [00:02<00:16, 1.29it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 17% 4/24 [00:03<00:15, 1.29it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 21% 5/24 [00:03<00:14, 1.28it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 25% 6/24 [00:04<00:14, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 29% 7/24 [00:05<00:13, 1.27it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 33% 8/24 [00:06<00:12, 1.26it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 38% 9/24 [00:07<00:12, 1.24it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 42% 10/24 [00:07<00:11, 1.22it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 46% 11/24 [00:08<00:10, 1.22it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 50% 12/24 [00:09<00:10, 1.17it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 54% 13/24 [00:10<00:10, 1.10it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 58% 14/24 [00:11<00:09, 1.05it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 62% 15/24 [00:12<00:08, 1.03it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 67% 16/24 [00:13<00:07, 1.02it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 71% 17/24 [00:14<00:06, 1.00it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 75% 18/24 [00:15<00:06, 1.02s/it]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 79% 19/24 [00:16<00:04, 1.05it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 83% 20/24 [00:17<00:03, 1.10it/s]\n", - 
"Compressing /encoder/layer.5/attention/output/dense/MatMul: 88% 21/24 [00:18<00:02, 1.14it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 92% 22/24 [00:19<00:01, 1.18it/s]\n", - "Compressing /encoder/layer.4/output/dense/MatMul: 100% 96/96 [01:23<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 96% 23/24 [00:19<00:00, 1.20it/s]\n", - "Compressing /encoder/layer.5/intermediate/dense/MatMul: 0% 0/24 [00:00, ?it/s]\u001b[A\n", - "Compressing /encoder/layer.5/attention/output/dense/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 0% 0/96 [00:00, ?it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 1% 1/96 [00:00<01:16, 1.24it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 2% 2/96 [00:01<01:14, 1.27it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 3% 3/96 [00:02<01:13, 1.26it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 4% 4/96 [00:03<01:12, 1.27it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 5% 5/96 [00:03<01:12, 1.26it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 6% 6/96 [00:04<01:11, 1.26it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 7% 7/96 [00:05<01:11, 1.25it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 8% 8/96 [00:06<01:10, 1.24it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 9% 9/96 [00:07<01:11, 1.22it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 10% 10/96 [00:08<01:11, 1.21it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 11% 11/96 [00:08<01:12, 1.17it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 12% 12/96 [00:10<01:17, 1.09it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 14% 13/96 [00:11<01:18, 1.05it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 15% 14/96 [00:12<01:19, 1.03it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 16% 15/96 
[00:13<01:20, 1.01it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 17% 16/96 [00:14<01:20, 1.00s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 18% 17/96 [00:15<01:20, 1.02s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 19% 18/96 [00:16<01:15, 1.04it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 20% 19/96 [00:16<01:10, 1.09it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 21% 20/96 [00:17<01:05, 1.15it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 22% 21/96 [00:18<01:03, 1.18it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 23% 22/96 [00:19<01:00, 1.21it/s]\n", - "Compressing /encoder/layer.5/intermediate/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 24% 23/96 [00:19<00:59, 1.23it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 25% 24/96 [00:20<00:58, 1.24it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 26% 25/96 [00:21<00:56, 1.25it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 27% 26/96 [00:22<00:56, 1.25it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 28% 27/96 [00:23<00:54, 1.26it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 29% 28/96 [00:23<00:53, 1.26it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 30% 29/96 [00:24<00:52, 1.27it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 31% 30/96 [00:25<00:52, 1.26it/s]\n", - "Compressing /encoder/layer.6/attention/self/key/MatMul: 29% 7/24 [00:05<00:13, 1.25it/s]\u001b[A\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 32% 31/96 [00:26<00:54, 1.20it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 33% 32/96 [00:27<00:53, 1.19it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 34% 33/96 [00:28<00:53, 1.17it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 35% 34/96 [00:29<00:53, 1.17it/s]\n", - "Compressing 
/encoder/layer.5/output/dense/MatMul: 36% 35/96 [00:30<00:55, 1.10it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 38% 36/96 [00:31<00:56, 1.05it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 39% 37/96 [00:32<00:57, 1.03it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 40% 38/96 [00:33<00:57, 1.02it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 41% 39/96 [00:34<00:56, 1.00it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 43% 41/96 [00:36<00:53, 1.04it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 44% 42/96 [00:36<00:49, 1.10it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 45% 43/96 [00:37<00:46, 1.15it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 46% 44/96 [00:38<00:43, 1.19it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 47% 45/96 [00:39<00:42, 1.20it/s]\n", - "Compressing /encoder/layer.6/attention/self/key/MatMul: 92% 22/24 [00:19<00:01, 1.20it/s]\u001b[A\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 48% 46/96 [00:39<00:40, 1.22it/s]\n", - "Compressing /encoder/layer.6/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 49% 47/96 [00:40<00:39, 1.23it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 50% 48/96 [00:41<00:38, 1.24it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 51% 49/96 [00:42<00:37, 1.26it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 52% 50/96 [00:43<00:36, 1.27it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 53% 51/96 [00:43<00:36, 1.24it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 54% 52/96 [00:44<00:34, 1.26it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 55% 53/96 [00:45<00:34, 1.26it/s]\n", - "Compressing /encoder/layer.6/attention/self/query/MatMul: 25% 6/24 [00:04<00:14, 1.28it/s]\u001b[A\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 56% 54/96 
[00:46<00:34, 1.22it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 57% 55/96 [00:47<00:34, 1.21it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 58% 56/96 [00:48<00:33, 1.20it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 59% 57/96 [00:48<00:32, 1.19it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 60% 58/96 [00:50<00:34, 1.11it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 61% 59/96 [00:51<00:35, 1.05it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 62% 60/96 [00:52<00:39, 1.09s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 64% 61/96 [00:54<00:42, 1.22s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 65% 62/96 [00:55<00:46, 1.36s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 66% 63/96 [00:57<00:46, 1.41s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 67% 64/96 [00:58<00:40, 1.28s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 69% 66/96 [00:59<00:31, 1.06s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 70% 67/96 [01:00<00:28, 1.01it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 71% 68/96 [01:01<00:25, 1.08it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 72% 69/96 [01:02<00:23, 1.13it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 73% 70/96 [01:03<00:25, 1.03it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 74% 71/96 [01:04<00:28, 1.12s/it]\n", - "Compressing /encoder/layer.6/attention/self/query/MatMul: 100% 24/24 [00:24<00:00, 1.03s/it]\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 79%|▊| 19/24 [00:42<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 83%|▊| 20/24 [00:44<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 88%|▉| 21/24 [00:47<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 92%|▉| 22/24 [00:49<00:\u001b[A\n", + "Compressing 
/encoder/layer.6/intermediate/dense/MatMul: 96%|▉| 23/24 [00:51<00:\u001b[A\n", + "Compressing /encoder/layer.6/intermediate/dense/MatMul: 100%|█| 24/24 [00:53<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 75% 72/96 [01:05<00:24, 1.02s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 76% 73/96 [01:06<00:22, 1.04it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 77% 74/96 [01:07<00:20, 1.07it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 78% 75/96 [01:08<00:19, 1.10it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 79% 76/96 [01:09<00:18, 1.11it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 80% 77/96 [01:10<00:16, 1.13it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 81% 78/96 [01:10<00:15, 1.14it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 82% 79/96 [01:11<00:15, 1.08it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 83% 80/96 [01:12<00:15, 1.03it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 84% 81/96 [01:14<00:14, 1.01it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 85% 82/96 [01:15<00:14, 1.01s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 86% 83/96 [01:16<00:13, 1.02s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 88% 84/96 [01:17<00:12, 1.03s/it]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 89% 85/96 [01:18<00:10, 1.03it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 90% 86/96 [01:18<00:09, 1.08it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 91% 87/96 [01:19<00:07, 1.13it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 92% 88/96 [01:20<00:06, 1.17it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 93% 89/96 [01:21<00:05, 1.19it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 94% 90/96 [01:21<00:04, 1.22it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 95% 91/96 [01:22<00:04, 
1.24it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 96% 92/96 [01:23<00:03, 1.24it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 97% 93/96 [01:24<00:02, 1.25it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 98% 94/96 [01:25<00:01, 1.23it/s]\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 99% 95/96 [01:25<00:00, 1.24it/s]\n", - "Compressing /encoder/layer.6/attention/self/value/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 0%| | 0/24 [00:00, ?\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 4%| | 1/24 [00:02<00:5\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 8%| | 2/24 [00:05<00:5\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 9%| | 9/96 [00:17<02:44, 1.\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 12%|▏| 3/24 [00:07<00:5\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 17%|▏| 4/24 [00:10<00:5\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 12%|▏| 12/96 [00:23<02:36, 1\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 21%|▏| 5/24 [00:12<00:4\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 25%|▎| 6/24 [00:15<00:4\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 29%|▎| 7/24 [00:18<00:4\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 17%|▏| 16/96 [00:30<02:31, 1\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 33%|▎| 8/24 [00:20<00:4\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 38%|▍| 9/24 [00:23<00:3\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 20%|▏| 19/96 [00:36<02:26, 1\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 42%|▍| 10/24 [00:26<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 46%|▍| 11/24 [00:28<00:\u001b[A\n", + 
"Compressing /encoder/layer.6/output/dense/MatMul: 23%|▏| 22/96 [00:42<02:18, 1\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 50%|▌| 12/24 [00:31<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 54%|▌| 13/24 [00:34<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 58%|▌| 14/24 [00:36<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 62%|▋| 15/24 [00:38<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 67%|▋| 16/24 [00:40<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 75%|▊| 18/24 [00:44<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 79%|▊| 19/24 [00:46<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 83%|▊| 20/24 [00:48<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 88%|▉| 21/24 [00:50<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 92%|▉| 22/24 [00:52<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 96%|▉| 23/24 [00:54<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/key/MatMul: 100%|█| 24/24 [00:55<00:\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 4%| | 1/24 [00:01<00\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 8%| | 2/24 [00:03<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 12%|▏| 3/24 [00:05<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 17%|▏| 4/24 [00:07<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 21%|▏| 5/24 [00:09<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 25%|▎| 6/24 [00:11<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 29%|▎| 7/24 [00:14<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 33%|▎| 8/24 [00:16<00\u001b[A\n", + "Compressing 
/encoder/layer.7/attention/self/query/MatMul: 38%|▍| 9/24 [00:18<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 42%|▍| 10/24 [00:20<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 46%|▍| 11/24 [00:22<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 50%|▌| 12/24 [00:25<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 54%|▌| 13/24 [00:27<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 58%|▌| 14/24 [00:29<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 62%|▋| 15/24 [00:31<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 67%|▋| 16/24 [00:34<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 71%|▋| 17/24 [00:36<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 79%|▊| 19/24 [00:40<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 83%|▊| 20/24 [00:42<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 88%|▉| 21/24 [00:45<0\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 56%|▌| 54/96 [01:53<01:35, 2\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 92%|▉| 22/24 [00:47<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 96%|▉| 23/24 [00:49<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/query/MatMul: 100%|█| 24/24 [00:51<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 4%| | 1/24 [00:02<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 12%|▏| 3/24 [00:06<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 17%|▏| 4/24 [00:08<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 21%|▏| 5/24 [00:10<00\u001b[A\n", + "Compressing 
/encoder/layer.7/attention/self/value/MatMul: 25%|▎| 6/24 [00:12<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 29%|▎| 7/24 [00:14<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 33%|▎| 8/24 [00:16<00\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 38%|▍| 9/24 [00:18<00\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 69%|▋| 66/96 [02:20<01:03, 2\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 42%|▍| 10/24 [00:21<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 50%|▌| 12/24 [00:25<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 54%|▌| 13/24 [00:27<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 58%|▌| 14/24 [00:29<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 62%|▋| 15/24 [00:31<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 67%|▋| 16/24 [00:34<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 71%|▋| 17/24 [00:35<0\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 77%|▊| 74/96 [02:37<00:46, 2\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 75%|▊| 18/24 [00:38<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 83%|▊| 20/24 [00:42<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 88%|▉| 21/24 [00:44<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 92%|▉| 22/24 [00:46<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 96%|▉| 23/24 [00:49<0\u001b[A\n", + "Compressing /encoder/layer.7/attention/self/value/MatMul: 100%|█| 24/24 [00:51<0\u001b[A\n", "\n", - "Compressing /encoder/layer.5/output/dense/MatMul: 100% 96/96 [01:26<00:00, 1.11it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing 
/encoder/layer.6/intermediate/dense/MatMul: 4% 1/24 [00:00<00:19, 1.17it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 8% 2/24 [00:01<00:19, 1.16it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 12% 3/24 [00:02<00:18, 1.15it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 17% 4/24 [00:03<00:17, 1.15it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 21% 5/24 [00:04<00:16, 1.15it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 25% 6/24 [00:05<00:16, 1.07it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 29% 7/24 [00:06<00:16, 1.03it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 33% 8/24 [00:07<00:15, 1.01it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 38% 9/24 [00:08<00:15, 1.01s/it]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 42% 10/24 [00:09<00:14, 1.03s/it]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 46% 11/24 [00:10<00:13, 1.06s/it]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 50% 12/24 [00:11<00:12, 1.06s/it]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 54% 13/24 [00:12<00:10, 1.02it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 58% 14/24 [00:13<00:09, 1.08it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 62% 15/24 [00:14<00:07, 1.13it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 67% 16/24 [00:14<00:06, 1.16it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 71% 17/24 [00:15<00:05, 1.19it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 75% 18/24 [00:16<00:04, 1.21it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 79% 19/24 [00:17<00:04, 1.21it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 83% 20/24 [00:18<00:03, 1.21it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 88% 21/24 [00:19<00:02, 
1.22it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 92% 22/24 [00:19<00:01, 1.23it/s]\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 96% 23/24 [00:20<00:00, 1.25it/s]\n", - "Compressing /encoder/layer.6/attention/output/dense/MatMul: 100% 24/24 [00:21<00:00, 1.12it/s]\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 0%| | 0/24 [00:00<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 4%| | 1/24 [00:02<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 8%| | 2/24 [00:04<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 12%|▏| 3/24 [00:06<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 17%|▏| 4/24 [00:08<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 25%|▎| 6/24 [00:12<\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 91%|▉| 87/96 [03:05<00:20, 2\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 29%|▎| 7/24 [00:15<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 33%|▎| 8/24 [00:17<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 38%|▍| 9/24 [00:19<\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 42%|▍| 10/24 [00:21\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 50%|▌| 12/24 [00:25\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 54%|▌| 13/24 [00:27\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 58%|▌| 14/24 [00:29\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 62%|▋| 15/24 [00:31\u001b[A\n", + "Compressing /encoder/layer.6/output/dense/MatMul: 100%|█| 96/96 [03:24<00:00, 2\u001b[A\n", "\n", - "Compressing /encoder/layer.6/intermediate/dense/MatMul: 100% 24/24 [00:21<00:00, 1.12it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 0% 0/24 
[00:00, ?it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 4% 1/24 [00:00<00:19, 1.20it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 8% 2/24 [00:01<00:18, 1.17it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 12% 3/24 [00:02<00:18, 1.16it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 17% 4/24 [00:03<00:17, 1.17it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 21% 5/24 [00:04<00:16, 1.12it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 25% 6/24 [00:05<00:16, 1.06it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 29% 7/24 [00:06<00:16, 1.03it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 33% 8/24 [00:07<00:15, 1.00it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 38% 9/24 [00:08<00:15, 1.01s/it]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 42% 10/24 [00:09<00:14, 1.03s/it]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 46% 11/24 [00:10<00:13, 1.03s/it]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 50% 12/24 [00:11<00:11, 1.04it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 54% 13/24 [00:12<00:10, 1.09it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 58% 14/24 [00:13<00:08, 1.15it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 62% 15/24 [00:13<00:07, 1.19it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 67% 16/24 [00:14<00:06, 1.22it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 71% 17/24 [00:15<00:05, 1.25it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 75% 18/24 [00:16<00:04, 1.27it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 79% 19/24 [00:16<00:03, 1.28it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 83% 20/24 [00:17<00:03, 1.27it/s]\n", - "Compressing 
/encoder/layer.7/attention/self/key/MatMul: 88% 21/24 [00:18<00:02, 1.27it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 92% 22/24 [00:19<00:01, 1.27it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 96% 23/24 [00:19<00:00, 1.28it/s]\n", - "Compressing /encoder/layer.7/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 4% 1/24 [00:00<00:21, 1.08it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 8% 2/24 [00:01<00:19, 1.13it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 12% 3/24 [00:02<00:18, 1.15it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 17% 4/24 [00:03<00:18, 1.11it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 21% 5/24 [00:04<00:17, 1.06it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 25% 6/24 [00:05<00:17, 1.02it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 29% 7/24 [00:07<00:19, 1.15s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 33% 8/24 [00:08<00:21, 1.32s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 38% 9/24 [00:10<00:20, 1.36s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 42% 10/24 [00:11<00:19, 1.36s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 46% 11/24 [00:12<00:15, 1.21s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 50% 12/24 [00:13<00:13, 1.12s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 54% 13/24 [00:14<00:11, 1.05s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 58% 14/24 [00:15<00:10, 1.03s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 62% 15/24 [00:16<00:09, 1.03s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 
67% 16/24 [00:17<00:07, 1.04it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 71% 17/24 [00:18<00:07, 1.01s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 75% 18/24 [00:19<00:06, 1.01s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 79% 19/24 [00:20<00:04, 1.03it/s]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 83% 20/24 [00:21<00:04, 1.00s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 88% 21/24 [00:22<00:03, 1.04s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 92% 22/24 [00:23<00:02, 1.10s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 96% 23/24 [00:24<00:01, 1.10s/it]\n", - "Compressing /encoder/layer.7/attention/self/query/MatMul: 100% 24/24 [00:26<00:00, 1.09s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 4% 1/24 [00:01<00:29, 1.27s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 8% 2/24 [00:02<00:28, 1.30s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 12% 3/24 [00:04<00:30, 1.43s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 17% 4/24 [00:05<00:29, 1.47s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 21% 5/24 [00:07<00:26, 1.42s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 25% 6/24 [00:08<00:25, 1.43s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 29% 7/24 [00:09<00:22, 1.34s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 33% 8/24 [00:10<00:20, 1.26s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 38% 9/24 [00:11<00:18, 1.26s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 42% 10/24 [00:13<00:17, 1.26s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 46% 11/24 [00:14<00:15, 1.20s/it]\n", - 
"Compressing /encoder/layer.7/attention/self/value/MatMul: 50% 12/24 [00:15<00:14, 1.25s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 54% 13/24 [00:16<00:13, 1.19s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 58% 14/24 [00:17<00:11, 1.15s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 62% 15/24 [00:19<00:10, 1.19s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 67% 16/24 [00:20<00:10, 1.30s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 71% 17/24 [00:21<00:09, 1.32s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 75% 18/24 [00:23<00:08, 1.42s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 79% 19/24 [00:25<00:07, 1.49s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 83% 20/24 [00:26<00:05, 1.48s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 88% 21/24 [00:28<00:04, 1.41s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 92% 22/24 [00:29<00:02, 1.38s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 96% 23/24 [00:30<00:01, 1.38s/it]\n", - "Compressing /encoder/layer.7/attention/self/value/MatMul: 100% 24/24 [00:31<00:00, 1.33s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 4% 1/24 [00:00<00:21, 1.06it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 8% 2/24 [00:01<00:20, 1.07it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 12% 3/24 [00:02<00:20, 1.03it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 17% 4/24 [00:03<00:20, 1.01s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 21% 5/24 [00:04<00:19, 1.02s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 25% 6/24 [00:05<00:17, 1.02it/s]\n", - "Compressing 
/encoder/layer.7/attention/output/dense/MatMul: 29% 7/24 [00:06<00:15, 1.09it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 33% 8/24 [00:07<00:14, 1.08it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 38% 9/24 [00:08<00:13, 1.10it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 42% 10/24 [00:09<00:12, 1.09it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 46% 11/24 [00:10<00:12, 1.02it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 50% 12/24 [00:11<00:12, 1.02s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 54% 13/24 [00:12<00:11, 1.06s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 58% 14/24 [00:14<00:12, 1.21s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 62% 15/24 [00:15<00:10, 1.18s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 67% 16/24 [00:16<00:09, 1.14s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 71% 17/24 [00:17<00:07, 1.10s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 75% 18/24 [00:18<00:06, 1.07s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 79% 19/24 [00:19<00:05, 1.05s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 83% 20/24 [00:20<00:04, 1.04s/it]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 88% 21/24 [00:21<00:02, 1.03it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 92% 22/24 [00:22<00:01, 1.10it/s]\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 96% 23/24 [00:22<00:00, 1.12it/s]\n", - "Compressing /encoder/layer.6/output/dense/MatMul: 100% 96/96 [01:42<00:00, 1.07s/it]\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 67%|▋| 16/24 [00:34\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 71%|▋| 17/24 [00:36\u001b[A\n", + "Compressing 
/encoder/layer.7/attention/output/dense/MatMul: 75%|▊| 18/24 [00:38\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 83%|▊| 20/24 [00:41\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 88%|▉| 21/24 [00:44\u001b[A\n", + "Compressing /encoder/layer.7/intermediate/dense/MatMul: 21%|▏| 5/24 [00:11<00:4\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 96%|▉| 23/24 [00:48\u001b[A\n", + "Compressing /encoder/layer.7/attention/output/dense/MatMul: 100%|█| 24/24 [00:50\u001b[A\n", "\n", - "Compressing /encoder/layer.7/attention/output/dense/MatMul: 100% 24/24 [00:23<00:00, 1.01it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 0% 0/96 [00:00, ?it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 1% 1/96 [00:00<01:11, 1.32it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 2% 2/96 [00:01<01:13, 1.27it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 3% 3/96 [00:02<01:11, 1.30it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 4% 4/96 [00:03<01:11, 1.28it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 5% 5/96 [00:03<01:12, 1.25it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 6% 6/96 [00:04<01:10, 1.27it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 7% 7/96 [00:05<01:09, 1.28it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 8% 8/96 [00:06<01:08, 1.29it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 9% 9/96 [00:07<01:09, 1.25it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 10% 10/96 [00:07<01:09, 1.24it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 11% 11/96 [00:08<01:09, 1.23it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 12% 12/96 [00:09<01:08, 1.23it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 14% 13/96 [00:10<01:09, 1.19it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 15% 14/96 [00:11<01:13, 1.11it/s]\n", 
- "Compressing /encoder/layer.7/output/dense/MatMul: 16% 15/96 [00:12<01:17, 1.05it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 17% 16/96 [00:13<01:17, 1.03it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 18% 17/96 [00:14<01:19, 1.00s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 19% 18/96 [00:15<01:18, 1.01s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 20% 19/96 [00:16<01:17, 1.01s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 21% 20/96 [00:17<01:11, 1.06it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 22% 21/96 [00:18<01:08, 1.10it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 23% 22/96 [00:19<01:04, 1.14it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 24% 23/96 [00:19<01:01, 1.18it/s]\n", - "Compressing /encoder/layer.7/intermediate/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 1%| | 1/96 [00:01<02:51, 1.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 2%| | 2/96 [00:03<02:52, 1.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 3%| | 3/96 [00:05<03:03, 1.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 4%| | 4/96 [00:07<03:03, 2.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 5%| | 5/96 [00:09<02:58, 1.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 6%| | 6/96 [00:11<03:02, 2.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 7%| | 7/96 [00:14<03:07, 2.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 9%| | 9/96 [00:18<03:01, 2.\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 10%| | 10/96 [00:20<02:57, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 11%| | 11/96 [00:22<02:57, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 12%|▏| 12/96 [00:24<02:56, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 
14%|▏| 13/96 [00:26<02:51, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 15%|▏| 14/96 [00:28<02:46, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 16%|▏| 15/96 [00:30<02:43, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 17%|▏| 16/96 [00:32<02:40, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 18%|▏| 17/96 [00:34<02:41, 2\u001b[A\n", + "Compressing /encoder/layer.7/intermediate/dense/MatMul: 100%|█| 24/24 [00:54<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 25% 24/96 [00:20<01:00, 1.19it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 26% 25/96 [00:21<00:58, 1.21it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 27% 26/96 [00:22<00:57, 1.23it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 28% 27/96 [00:23<00:55, 1.25it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 29% 28/96 [00:23<00:54, 1.25it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 30% 29/96 [00:24<00:53, 1.26it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 31% 30/96 [00:25<00:52, 1.27it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 32% 31/96 [00:26<00:50, 1.28it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 34% 33/96 [00:27<00:51, 1.23it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 35% 34/96 [00:28<00:52, 1.19it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 36% 35/96 [00:30<01:05, 1.07s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 38% 36/96 [00:32<01:16, 1.28s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 39% 37/96 [00:33<01:15, 1.29s/it]\n", - "Compressing /encoder/layer.8/attention/self/key/MatMul: 54% 13/24 [00:12<00:14, 1.29s/it]\u001b[A\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 40% 38/96 [00:34<01:14, 1.28s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 41% 39/96 [00:36<01:14, 1.30s/it]\n", - 
"Compressing /encoder/layer.7/output/dense/MatMul: 42% 40/96 [00:37<01:13, 1.32s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 44% 42/96 [00:39<01:07, 1.26s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 45% 43/96 [00:41<01:05, 1.23s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 46% 44/96 [00:42<01:00, 1.16s/it]\n", - "Compressing /encoder/layer.8/attention/self/key/MatMul: 83% 20/24 [00:21<00:04, 1.16s/it]\u001b[A\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 47% 45/96 [00:43<00:57, 1.12s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 48% 46/96 [00:44<00:53, 1.08s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 49% 47/96 [00:44<00:48, 1.01it/s]\n", - "Compressing /encoder/layer.8/attention/self/key/MatMul: 100% 24/24 [00:24<00:00, 1.04s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 50% 48/96 [00:45<00:44, 1.08it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 51% 49/96 [00:46<00:41, 1.13it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 52% 50/96 [00:47<00:39, 1.17it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 53% 51/96 [00:47<00:37, 1.20it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 54% 52/96 [00:48<00:35, 1.22it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 55% 53/96 [00:49<00:35, 1.23it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 56% 54/96 [00:50<00:34, 1.22it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 57% 55/96 [00:51<00:34, 1.21it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 58% 56/96 [00:52<00:33, 1.19it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 59% 57/96 [00:53<00:33, 1.15it/s]\n", - "Compressing /encoder/layer.8/attention/self/query/MatMul: 38% 9/24 [00:07<00:12, 1.16it/s]\u001b[A\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 60% 58/96 [00:54<00:34, 1.09it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 
62% 60/96 [00:56<00:38, 1.08s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 64% 61/96 [00:57<00:39, 1.12s/it]\n", - "Compressing /encoder/layer.8/attention/self/query/MatMul: 54% 13/24 [00:12<00:12, 1.13s/it]\u001b[A\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 65% 62/96 [00:59<00:39, 1.17s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 67% 64/96 [01:01<00:36, 1.13s/it]\n", - "Compressing /encoder/layer.8/attention/self/query/MatMul: 67% 16/24 [00:15<00:09, 1.13s/it]\u001b[A\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 68% 65/96 [01:02<00:33, 1.09s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 69% 66/96 [01:03<00:34, 1.13s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 70% 67/96 [01:04<00:32, 1.11s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 71% 68/96 [01:05<00:29, 1.04s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 72% 69/96 [01:06<00:27, 1.03s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 74% 71/96 [01:08<00:23, 1.06it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 75% 72/96 [01:09<00:23, 1.03it/s]\n", - "Compressing /encoder/layer.8/attention/self/query/MatMul: 100% 24/24 [00:23<00:00, 1.01it/s]\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 19%|▏| 18/96 [00:37<02:48, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 21%|▏| 20/96 [00:40<02:33, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 22%|▏| 21/96 [00:42<02:28, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 23%|▏| 22/96 [00:44<02:27, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 24%|▏| 23/96 [00:46<02:26, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 25%|▎| 24/96 [00:48<02:26, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 26%|▎| 25/96 [00:51<02:27, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 27%|▎| 26/96 [00:53<02:25, 
2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 29%|▎| 28/96 [00:57<02:18, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 30%|▎| 29/96 [00:59<02:14, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 31%|▎| 30/96 [01:01<02:13, 2\u001b[A\n", + "Compressing /encoder/layer.8/attention/self/key/MatMul: 46%|▍| 11/24 [00:24<00:\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 32%|▎| 31/96 [01:03<02:14, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 33%|▎| 32/96 [01:05<02:11, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 35%|▎| 34/96 [01:09<02:01, 1\u001b[A\n", + "Compressing /encoder/layer.8/attention/self/key/MatMul: 62%|▋| 15/24 [00:32<00:\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 38%|▍| 36/96 [01:12<01:55, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 39%|▍| 37/96 [01:14<01:52, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 40%|▍| 38/96 [01:16<01:49, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 41%|▍| 39/96 [01:18<01:46, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 43%|▍| 41/96 [01:22<01:41, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 44%|▍| 42/96 [01:23<01:40, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 45%|▍| 43/96 [01:25<01:40, 1\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 46%|▍| 44/96 [01:28<01:42, 1\u001b[A\n", + "Compressing /encoder/layer.8/attention/self/key/MatMul: 100%|█| 24/24 [00:52<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 76% 73/96 [01:10<00:21, 1.06it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 77% 74/96 [01:11<00:22, 1.01s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 78% 75/96 [01:12<00:21, 1.04s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 79% 76/96 [01:13<00:21, 1.09s/it]\n", - 
"Compressing /encoder/layer.7/output/dense/MatMul: 80% 77/96 [01:14<00:21, 1.12s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 81% 78/96 [01:16<00:21, 1.17s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 82% 79/96 [01:17<00:20, 1.19s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 83% 80/96 [01:18<00:19, 1.20s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 84% 81/96 [01:19<00:18, 1.21s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 85% 82/96 [01:21<00:17, 1.27s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 86% 83/96 [01:22<00:16, 1.29s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 88% 84/96 [01:23<00:14, 1.19s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 89% 85/96 [01:24<00:11, 1.08s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 90% 86/96 [01:25<00:10, 1.03s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 91% 87/96 [01:26<00:09, 1.00s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 92% 88/96 [01:27<00:08, 1.02s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 93% 89/96 [01:28<00:07, 1.01s/it]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 94% 90/96 [01:29<00:05, 1.00it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 95% 91/96 [01:29<00:04, 1.06it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 96% 92/96 [01:30<00:03, 1.11it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 97% 93/96 [01:31<00:02, 1.16it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 98% 94/96 [01:32<00:01, 1.21it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 99% 95/96 [01:33<00:00, 1.17it/s]\n", - "Compressing /encoder/layer.7/output/dense/MatMul: 100% 96/96 [01:34<00:00, 1.02it/s]\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 47%|▍| 45/96 [01:30<01:44, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 48%|▍| 46/96 [01:32<01:40, 
2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 49%|▍| 47/96 [01:34<01:40, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 50%|▌| 48/96 [01:36<01:40, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 51%|▌| 49/96 [01:38<01:35, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 52%|▌| 50/96 [01:40<01:33, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 53%|▌| 51/96 [01:42<01:36, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 54%|▌| 52/96 [01:45<01:35, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 55%|▌| 53/96 [01:47<01:35, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 56%|▌| 54/96 [01:49<01:32, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 57%|▌| 55/96 [01:51<01:28, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 58%|▌| 56/96 [01:54<01:30, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 59%|▌| 57/96 [01:56<01:28, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 60%|▌| 58/96 [01:58<01:27, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 61%|▌| 59/96 [02:01<01:27, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 62%|▋| 60/96 [02:03<01:24, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 64%|▋| 61/96 [02:05<01:20, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 65%|▋| 62/96 [02:07<01:13, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 66%|▋| 63/96 [02:09<01:09, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 67%|▋| 64/96 [02:11<01:08, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 68%|▋| 65/96 [02:14<01:10, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 69%|▋| 66/96 [02:16<01:05, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 70%|▋| 67/96 [02:18<01:01, 2\u001b[A\n", + 
"Compressing /encoder/layer.7/output/dense/MatMul: 71%|▋| 68/96 [02:20<00:58, 2\u001b[A\n", + "Compressing /encoder/layer.8/attention/self/query/MatMul: 100%|█| 24/24 [00:52<0\u001b[A\n", "\n", - "Compressing /encoder/layer.8/attention/self/value/MatMul: 100% 24/24 [00:24<00:00, 1.03s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 4% 1/24 [00:00<00:19, 1.20it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 8% 2/24 [00:01<00:18, 1.21it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 12% 3/24 [00:02<00:18, 1.14it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 17% 4/24 [00:03<00:18, 1.06it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 21% 5/24 [00:04<00:18, 1.03it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 25% 6/24 [00:05<00:17, 1.02it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 29% 7/24 [00:06<00:16, 1.00it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 33% 8/24 [00:07<00:16, 1.00s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 38% 9/24 [00:08<00:15, 1.00s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 42% 10/24 [00:09<00:13, 1.07it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 46% 11/24 [00:10<00:11, 1.13it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 50% 12/24 [00:11<00:10, 1.17it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 54% 13/24 [00:11<00:09, 1.21it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 58% 14/24 [00:12<00:08, 1.23it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 62% 15/24 [00:13<00:07, 1.25it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 67% 16/24 [00:14<00:06, 1.26it/s]\n", - 
"Compressing /encoder/layer.8/attention/output/dense/MatMul: 71% 17/24 [00:15<00:06, 1.12it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 75% 18/24 [00:16<00:05, 1.02it/s]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 79% 19/24 [00:17<00:05, 1.06s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 83% 20/24 [00:18<00:04, 1.06s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 88% 21/24 [00:19<00:03, 1.09s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 92% 22/24 [00:21<00:02, 1.11s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 96% 23/24 [00:22<00:01, 1.19s/it]\n", - "Compressing /encoder/layer.8/attention/output/dense/MatMul: 100% 24/24 [00:23<00:00, 1.01it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 0% 0/96 [00:00, ?it/s]\n", - "Compressing /encoder/layer.8/intermediate/dense/MatMul: 100% 24/24 [00:24<00:00, 1.02s/it]\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 72%|▋| 69/96 [02:22<00:56, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 73%|▋| 70/96 [02:24<00:54, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 74%|▋| 71/96 [02:26<00:52, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 75%|▊| 72/96 [02:28<00:51, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 76%|▊| 73/96 [02:31<00:50, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 77%|▊| 74/96 [02:33<00:48, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 78%|▊| 75/96 [02:35<00:45, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 79%|▊| 76/96 [02:37<00:42, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 80%|▊| 77/96 [02:39<00:41, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 81%|▊| 78/96 [02:42<00:40, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 82%|▊| 79/96 
[02:44<00:38, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 83%|▊| 80/96 [02:46<00:34, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 84%|▊| 81/96 [02:48<00:31, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 85%|▊| 82/96 [02:50<00:28, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 86%|▊| 83/96 [02:52<00:27, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 88%|▉| 84/96 [02:54<00:25, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 89%|▉| 85/96 [02:57<00:24, 2\u001b[A\n", + "Compressing /encoder/layer.8/attention/self/value/MatMul: 71%|▋| 17/24 [00:35<0\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 90%|▉| 86/96 [02:59<00:22, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 92%|▉| 88/96 [03:03<00:17, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 93%|▉| 89/96 [03:05<00:14, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 94%|▉| 90/96 [03:07<00:12, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 95%|▉| 91/96 [03:10<00:11, 2\u001b[A\n", + "Compressing /encoder/layer.8/attention/self/value/MatMul: 96%|▉| 23/24 [00:48<0\u001b[A\n", + "Compressing /encoder/layer.8/attention/self/value/MatMul: 100%|█| 24/24 [00:50<0\u001b[A\n", "\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 1% 1/96 [00:01<02:17, 1.45s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 2% 2/96 [00:02<02:04, 1.33s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 3% 3/96 [00:04<02:07, 1.37s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 4% 4/96 [00:05<02:06, 1.37s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 5% 5/96 [00:06<02:00, 1.32s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 6% 6/96 [00:07<01:55, 1.28s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 7% 7/96 [00:08<01:42, 1.15s/it]\n", - "Compressing 
/encoder/layer.8/output/dense/MatMul: 8% 8/96 [00:09<01:41, 1.15s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 9% 9/96 [00:11<01:38, 1.13s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 10% 10/96 [00:11<01:27, 1.02s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 11% 11/96 [00:12<01:20, 1.06it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 12% 12/96 [00:13<01:14, 1.13it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 14% 13/96 [00:14<01:11, 1.15it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 15% 14/96 [00:14<01:08, 1.19it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 16% 15/96 [00:15<01:07, 1.19it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 17% 16/96 [00:16<01:05, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 18% 17/96 [00:17<01:03, 1.24it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 19% 18/96 [00:18<01:03, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 20% 19/96 [00:19<01:03, 1.21it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 21% 20/96 [00:19<01:03, 1.20it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 22% 21/96 [00:20<01:02, 1.20it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 23% 22/96 [00:21<01:06, 1.12it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 24% 23/96 [00:22<01:08, 1.06it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 25% 24/96 [00:23<01:10, 1.03it/s]\n", - "Compressing /encoder/layer.9/attention/self/key/MatMul: 100% 24/24 [00:23<00:00, 1.03it/s]\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 96%|▉| 92/96 [03:12<00:09, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 97%|▉| 93/96 [03:15<00:06, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 99%|▉| 95/96 [03:19<00:02, 2\u001b[A\n", + "Compressing /encoder/layer.7/output/dense/MatMul: 100%|█| 96/96 [03:21<00:00, 
2\u001b[A\n", "\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 26% 25/96 [00:24<01:10, 1.01it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 27% 26/96 [00:25<01:10, 1.01s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 28% 27/96 [00:26<01:09, 1.01s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 29% 28/96 [00:27<01:06, 1.02it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 30% 29/96 [00:28<01:01, 1.09it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 31% 30/96 [00:29<00:58, 1.13it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 32% 31/96 [00:30<00:55, 1.17it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 33% 32/96 [00:31<00:53, 1.20it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 34% 33/96 [00:31<00:51, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 35% 34/96 [00:32<00:50, 1.23it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 36% 35/96 [00:33<00:49, 1.24it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 38% 36/96 [00:34<00:47, 1.25it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 39% 37/96 [00:34<00:46, 1.26it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 40% 38/96 [00:35<00:45, 1.27it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 41% 39/96 [00:36<00:44, 1.29it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 42% 40/96 [00:37<00:43, 1.29it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 43% 41/96 [00:38<00:44, 1.24it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 44% 42/96 [00:38<00:43, 1.24it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 45% 43/96 [00:39<00:43, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 46% 44/96 [00:40<00:42, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 47% 45/96 [00:41<00:44, 1.15it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 48% 46/96 
[00:42<00:45, 1.09it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 49% 47/96 [00:43<00:46, 1.06it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 50% 48/96 [00:44<00:46, 1.03it/s]\n", - "Compressing /encoder/layer.9/attention/self/query/MatMul: 100% 24/24 [00:20<00:00, 1.17it/s]\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 0%| | 0/24 [00:00, ?\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 4%| | 1/24 [00:02<00:5\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 8%| | 2/24 [00:04<00:5\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 12%|▏| 3/24 [00:07<00:5\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 17%|▏| 4/24 [00:09<00:4\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 21%|▏| 5/24 [00:11<00:4\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 25%|▎| 6/24 [00:14<00:4\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 29%|▎| 7/24 [00:16<00:4\u001b[A\n", + "Compressing /encoder/layer.8/attention/output/dense/MatMul: 50%|▌| 12/24 [00:25\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 33%|▎| 8/24 [00:19<00:3\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 42%|▍| 10/24 [00:23<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 46%|▍| 11/24 [00:25<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 50%|▌| 12/24 [00:27<00:\u001b[A\n", + "Compressing /encoder/layer.8/attention/output/dense/MatMul: 71%|▋| 17/24 [00:36\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 54%|▌| 13/24 [00:29<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 58%|▌| 14/24 [00:32<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 62%|▋| 15/24 [00:34<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 67%|▋| 16/24 
[00:37<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 71%|▋| 17/24 [00:39<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 75%|▊| 18/24 [00:40<00:\u001b[A\n", + "Compressing /encoder/layer.8/attention/output/dense/MatMul: 100%|█| 24/24 [00:51\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 79%|▊| 19/24 [00:42<00:\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 0%| | 0/96 [00:00, ?it/s]\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 83%|▊| 20/24 [00:45<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 88%|▉| 21/24 [00:48<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 92%|▉| 22/24 [00:50<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 96%|▉| 23/24 [00:52<00:\u001b[A\n", + "Compressing /encoder/layer.8/intermediate/dense/MatMul: 100%|█| 24/24 [00:54<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 0%| | 0/24 [00:00, ?\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 6%| | 6/96 [00:12<03:19, 2.\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 4%| | 1/24 [00:02<00:4\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 8%| | 2/24 [00:04<00:4\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 12%|▏| 3/24 [00:06<00:4\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 17%|▏| 4/24 [00:08<00:4\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 21%|▏| 5/24 [00:11<00:4\u001b[A\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 12%|▏| 12/96 [00:23<02:37, 1\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 25%|▎| 6/24 [00:13<00:4\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 29%|▎| 7/24 [00:15<00:3\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 33%|▎| 8/24 [00:17<00:3\u001b[A\n", + 
"Compressing /encoder/layer.9/attention/self/key/MatMul: 38%|▍| 9/24 [00:19<00:3\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 42%|▍| 10/24 [00:21<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 46%|▍| 11/24 [00:24<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 50%|▌| 12/24 [00:26<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 54%|▌| 13/24 [00:28<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 58%|▌| 14/24 [00:30<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 62%|▋| 15/24 [00:32<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 67%|▋| 16/24 [00:35<00:\u001b[A\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 25%|▎| 24/96 [00:47<02:22, 1\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 71%|▋| 17/24 [00:37<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 75%|▊| 18/24 [00:40<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 83%|▊| 20/24 [00:44<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 88%|▉| 21/24 [00:45<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 92%|▉| 22/24 [00:47<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 96%|▉| 23/24 [00:49<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/key/MatMul: 100%|█| 24/24 [00:52<00:\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 4%| | 1/24 [00:01<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 8%| | 2/24 [00:03<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 12%|▏| 3/24 [00:05<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 17%|▏| 4/24 [00:07<00\u001b[A\n", + "Compressing 
/encoder/layer.9/attention/self/query/MatMul: 21%|▏| 5/24 [00:09<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 25%|▎| 6/24 [00:11<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 29%|▎| 7/24 [00:13<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 38%|▍| 9/24 [00:17<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 42%|▍| 10/24 [00:19<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 46%|▍| 11/24 [00:21<0\u001b[A\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 44%|▍| 42/96 [01:26<01:52, 2\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 50%|▌| 12/24 [00:24<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 58%|▌| 14/24 [00:28<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 62%|▋| 15/24 [00:30<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 67%|▋| 16/24 [00:32<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 71%|▋| 17/24 [00:34<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 75%|▊| 18/24 [00:36<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 79%|▊| 19/24 [00:39<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 83%|▊| 20/24 [00:41<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 88%|▉| 21/24 [00:43<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 92%|▉| 22/24 [00:46<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 96%|▉| 23/24 [00:48<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/query/MatMul: 100%|█| 24/24 [00:50<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 0%| | 0/24 [00:00,\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 4%| | 1/24 [00:02<00\u001b[A\n", + "Compressing 
/encoder/layer.9/attention/self/value/MatMul: 8%| | 2/24 [00:04<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 12%|▏| 3/24 [00:07<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 17%|▏| 4/24 [00:09<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 21%|▏| 5/24 [00:11<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 25%|▎| 6/24 [00:13<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 29%|▎| 7/24 [00:15<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 33%|▎| 8/24 [00:17<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 38%|▍| 9/24 [00:19<00\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 42%|▍| 10/24 [00:21<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 46%|▍| 11/24 [00:24<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 50%|▌| 12/24 [00:26<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 54%|▌| 13/24 [00:28<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 58%|▌| 14/24 [00:30<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 62%|▋| 15/24 [00:32<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 67%|▋| 16/24 [00:35<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 71%|▋| 17/24 [00:37<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 75%|▊| 18/24 [00:39<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 79%|▊| 19/24 [00:41<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 83%|▊| 20/24 [00:43<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 88%|▉| 21/24 [00:46<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 92%|▉| 22/24 [00:48<0\u001b[A\n", + "Compressing 
/encoder/layer.9/attention/self/value/MatMul: 96%|▉| 23/24 [00:51<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/self/value/MatMul: 100%|█| 24/24 [00:53<0\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 0%| | 0/24 [00:00<\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 4%| | 1/24 [00:02<\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 8%| | 2/24 [00:03<\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 12%|▏| 3/24 [00:05<\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 17%|▏| 4/24 [00:08<\u001b[A\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 86%|▊| 83/96 [02:57<00:27, 2\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 21%|▏| 5/24 [00:11<\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 25%|▎| 6/24 [00:13<\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 29%|▎| 7/24 [00:15<\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 38%|▍| 9/24 [00:19<\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 42%|▍| 10/24 [00:21\u001b[A\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 93%|▉| 89/96 [03:10<00:15, 2\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 46%|▍| 11/24 [00:23\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 50%|▌| 12/24 [00:26\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 54%|▌| 13/24 [00:28\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 58%|▌| 14/24 [00:31\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 62%|▋| 15/24 [00:33\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 67%|▋| 16/24 [00:35\u001b[A\n", + "Compressing /encoder/layer.8/output/dense/MatMul: 100%|█| 96/96 [03:24<00:00, 2\u001b[A\n", "\n", - "Compressing 
/encoder/layer.8/output/dense/MatMul: 51% 49/96 [00:45<00:46, 1.02it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 52% 50/96 [00:46<00:46, 1.00s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 53% 51/96 [00:47<00:45, 1.02s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 54% 52/96 [00:48<00:41, 1.05it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 55% 53/96 [00:49<00:38, 1.10it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 56% 54/96 [00:50<00:36, 1.14it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 57% 55/96 [00:50<00:35, 1.17it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 58% 56/96 [00:51<00:33, 1.19it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 59% 57/96 [00:52<00:33, 1.15it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 60% 58/96 [00:53<00:31, 1.20it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 61% 59/96 [00:54<00:30, 1.20it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 62% 60/96 [00:55<00:29, 1.23it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 64% 61/96 [00:55<00:27, 1.25it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 65% 62/96 [00:56<00:27, 1.25it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 66% 63/96 [00:57<00:26, 1.25it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 67% 64/96 [00:58<00:25, 1.24it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 68% 65/96 [00:59<00:25, 1.21it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 69% 66/96 [00:59<00:24, 1.21it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 70% 67/96 [01:00<00:23, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 71% 68/96 [01:01<00:23, 1.17it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 72% 69/96 [01:02<00:24, 1.10it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 73% 70/96 [01:03<00:24, 1.06it/s]\n", - 
"Compressing /encoder/layer.8/output/dense/MatMul: 74% 71/96 [01:04<00:24, 1.03it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 75% 72/96 [01:05<00:23, 1.00it/s]\n", - "Compressing /encoder/layer.9/attention/self/value/MatMul: 100% 24/24 [00:21<00:00, 1.13it/s]\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 71%|▋| 17/24 [00:37\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 75%|▊| 18/24 [00:40\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 79%|▊| 19/24 [00:42\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 83%|▊| 20/24 [00:44\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 88%|▉| 21/24 [00:46\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 92%|▉| 22/24 [00:49\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 96%|▉| 23/24 [00:51\u001b[A\n", + "Compressing /encoder/layer.9/attention/output/dense/MatMul: 100%|█| 24/24 [00:53\u001b[A\n", "\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 76% 73/96 [01:06<00:23, 1.01s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 77% 74/96 [01:07<00:22, 1.02s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 78% 75/96 [01:08<00:19, 1.05it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 79% 76/96 [01:09<00:17, 1.11it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 80% 77/96 [01:10<00:16, 1.16it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 81% 78/96 [01:11<00:15, 1.19it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 82% 79/96 [01:11<00:13, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 83% 80/96 [01:12<00:13, 1.23it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 84% 81/96 [01:13<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 85% 82/96 [01:14<00:11, 1.26it/s]\n", - "Compressing 
/encoder/layer.8/output/dense/MatMul: 86% 83/96 [01:14<00:10, 1.27it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 88% 84/96 [01:15<00:09, 1.26it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 89% 85/96 [01:16<00:08, 1.25it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 90% 86/96 [01:17<00:08, 1.25it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 91% 87/96 [01:18<00:07, 1.24it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 92% 88/96 [01:19<00:06, 1.22it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 93% 89/96 [01:19<00:05, 1.20it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 94% 90/96 [01:20<00:05, 1.18it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 95% 91/96 [01:21<00:04, 1.17it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 96% 92/96 [01:22<00:03, 1.10it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 97% 93/96 [01:23<00:02, 1.05it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 98% 94/96 [01:24<00:01, 1.02it/s]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 99% 95/96 [01:25<00:01, 1.01s/it]\n", - "Compressing /encoder/layer.8/output/dense/MatMul: 100% 96/96 [01:26<00:00, 1.10it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.9/attention/output/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 1%| | 1/96 [00:02<03:20, 2.\u001b[A\n", + "Compressing /encoder/layer.9/intermediate/dense/MatMul: 38%|▍| 9/24 [00:19<00:3\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 2%| | 2/96 [00:04<03:26, 2.\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 4%| | 4/96 [00:08<03:13, 2.\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 5%| | 5/96 [00:10<03:08, 2.\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 6%| | 6/96 [00:13<03:29, 
2.\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 7%| | 7/96 [00:15<03:28, 2.\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 8%| | 8/96 [00:17<03:20, 2.\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 9%| | 9/96 [00:19<03:13, 2.\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 10%| | 10/96 [00:22<03:12, 2\u001b[A\n", + "Compressing /encoder/layer.9/intermediate/dense/MatMul: 75%|▊| 18/24 [00:39<00:\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 12%|▏| 12/96 [00:26<03:10, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 14%|▏| 13/96 [00:29<03:06, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 15%|▏| 14/96 [00:31<03:12, 2\u001b[A\n", + "Compressing /encoder/layer.9/intermediate/dense/MatMul: 92%|▉| 22/24 [00:48<00:\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 16%|▏| 15/96 [00:34<03:21, 2\u001b[A\n", + "Compressing /encoder/layer.9/intermediate/dense/MatMul: 100%|█| 24/24 [00:53<00:\u001b[A\n", "\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 4% 1/24 [00:01<00:24, 1.06s/it]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 8% 2/24 [00:01<00:20, 1.07it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 12% 3/24 [00:02<00:18, 1.14it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 17% 4/24 [00:03<00:16, 1.18it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 21% 5/24 [00:04<00:15, 1.21it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 25% 6/24 [00:05<00:14, 1.23it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 29% 7/24 [00:05<00:13, 1.22it/s]\n", - "Compressing /encoder/layer.9/output/dense/MatMul: 7% 7/96 [00:05<01:11, 1.25it/s]\u001b[A\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 33% 8/24 [00:06<00:13, 1.22it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 38% 9/24 
[00:07<00:12, 1.21it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 42% 10/24 [00:08<00:11, 1.22it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 46% 11/24 [00:09<00:10, 1.22it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 50% 12/24 [00:10<00:09, 1.22it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 54% 13/24 [00:10<00:08, 1.23it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 58% 14/24 [00:11<00:08, 1.21it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 62% 15/24 [00:12<00:07, 1.21it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 67% 16/24 [00:13<00:06, 1.18it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 71% 17/24 [00:14<00:05, 1.18it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 75% 18/24 [00:15<00:05, 1.13it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 79% 19/24 [00:16<00:04, 1.07it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 83% 20/24 [00:17<00:03, 1.02it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 88% 21/24 [00:18<00:02, 1.01it/s]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 92% 22/24 [00:19<00:02, 1.01s/it]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 96% 23/24 [00:20<00:01, 1.02s/it]\n", - "Compressing /encoder/layer.9/intermediate/dense/MatMul: 100% 24/24 [00:21<00:00, 1.12it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 4% 1/24 [00:00<00:17, 1.30it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 8% 2/24 [00:01<00:17, 1.27it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 12% 3/24 [00:02<00:16, 1.26it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 17% 4/24 [00:03<00:15, 1.28it/s]\n", - "Compressing 
/encoder/layer.10/attention/self/key/MatMul: 21% 5/24 [00:03<00:14, 1.28it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 25% 6/24 [00:04<00:13, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 29% 7/24 [00:05<00:13, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 38% 9/24 [00:07<00:11, 1.28it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 42% 10/24 [00:07<00:10, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 46% 11/24 [00:08<00:10, 1.28it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 50% 12/24 [00:09<00:09, 1.28it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 54% 13/24 [00:10<00:08, 1.26it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 58% 14/24 [00:11<00:08, 1.21it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 62% 15/24 [00:11<00:07, 1.19it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 67% 16/24 [00:12<00:06, 1.20it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 71% 17/24 [00:13<00:06, 1.14it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 75% 18/24 [00:14<00:05, 1.08it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 79% 19/24 [00:15<00:04, 1.05it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 83% 20/24 [00:16<00:03, 1.02it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 88% 21/24 [00:17<00:02, 1.00it/s]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 92% 22/24 [00:18<00:02, 1.01s/it]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 96% 23/24 [00:19<00:01, 1.01s/it]\n", - "Compressing /encoder/layer.10/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 4% 
1/24 [00:00<00:18, 1.26it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 8% 2/24 [00:01<00:17, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 12% 3/24 [00:02<00:16, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 17% 4/24 [00:03<00:15, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 21% 5/24 [00:03<00:14, 1.30it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 25% 6/24 [00:04<00:13, 1.32it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 29% 7/24 [00:05<00:13, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 33% 8/24 [00:06<00:12, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 38% 9/24 [00:06<00:11, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 42% 10/24 [00:07<00:10, 1.30it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 46% 11/24 [00:08<00:10, 1.28it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 50% 12/24 [00:09<00:09, 1.25it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 54% 13/24 [00:10<00:08, 1.23it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 58% 14/24 [00:11<00:08, 1.23it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 62% 15/24 [00:11<00:07, 1.21it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 67% 16/24 [00:12<00:06, 1.21it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 71% 17/24 [00:13<00:06, 1.14it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 75% 18/24 [00:14<00:05, 1.08it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 79% 19/24 [00:15<00:04, 1.04it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 83% 20/24 [00:16<00:03, 1.01it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 88% 21/24 
[00:17<00:03, 1.01s/it]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 92% 22/24 [00:18<00:02, 1.02s/it]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 96% 23/24 [00:19<00:00, 1.01it/s]\n", - "Compressing /encoder/layer.10/attention/self/query/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 17%|▏| 16/96 [00:36<03:14, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 18%|▏| 17/96 [00:38<03:01, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 19%|▏| 18/96 [00:41<02:59, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 21%|▏| 20/96 [00:45<02:47, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 22%|▏| 21/96 [00:47<02:40, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 23%|▏| 22/96 [00:49<02:42, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/key/MatMul: 25%|▎| 6/24 [00:13<00:\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 24%|▏| 23/96 [00:52<02:48, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 25%|▎| 24/96 [00:54<02:46, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 26%|▎| 25/96 [00:56<02:41, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 27%|▎| 26/96 [00:58<02:38, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 28%|▎| 27/96 [01:00<02:30, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 30%|▎| 29/96 [01:04<02:19, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 31%|▎| 30/96 [01:06<02:13, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 32%|▎| 31/96 [01:09<02:16, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/key/MatMul: 62%|▋| 15/24 [00:33<00\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 33%|▎| 32/96 [01:11<02:28, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 34%|▎| 33/96 
[01:14<02:27, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 35%|▎| 34/96 [01:16<02:19, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 36%|▎| 35/96 [01:18<02:12, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 38%|▍| 36/96 [01:20<02:11, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 39%|▍| 37/96 [01:22<02:10, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 40%|▍| 38/96 [01:24<02:02, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 41%|▍| 39/96 [01:26<01:57, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/key/MatMul: 100%|█| 24/24 [00:51<00\u001b[A\n", "\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 4% 1/24 [00:00<00:18, 1.25it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 8% 2/24 [00:01<00:17, 1.28it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 12% 3/24 [00:02<00:16, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 17% 4/24 [00:03<00:15, 1.27it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 21% 5/24 [00:03<00:15, 1.26it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 25% 6/24 [00:04<00:14, 1.26it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 29% 7/24 [00:05<00:13, 1.26it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 33% 8/24 [00:06<00:12, 1.26it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 38% 9/24 [00:07<00:11, 1.25it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 42% 10/24 [00:07<00:11, 1.27it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 46% 11/24 [00:08<00:10, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 50% 12/24 [00:09<00:09, 1.25it/s]\n", - "Compressing 
/encoder/layer.10/attention/self/value/MatMul: 54% 13/24 [00:10<00:09, 1.22it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 58% 14/24 [00:11<00:08, 1.21it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 62% 15/24 [00:12<00:07, 1.21it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 67% 16/24 [00:13<00:07, 1.11it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 71% 17/24 [00:14<00:06, 1.05it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 75% 18/24 [00:15<00:05, 1.02it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 79% 19/24 [00:16<00:05, 1.00s/it]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 83% 20/24 [00:17<00:04, 1.01s/it]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 88% 21/24 [00:18<00:03, 1.02s/it]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 92% 22/24 [00:19<00:01, 1.01it/s]\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 96% 23/24 [00:20<00:00, 1.08it/s]\n", - "Compressing /encoder/layer.9/output/dense/MatMul: 100% 96/96 [01:23<00:00, 1.15it/s]\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 42%|▍| 40/96 [01:28<01:54, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 43%|▍| 41/96 [01:30<01:55, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 44%|▍| 42/96 [01:33<01:54, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 45%|▍| 43/96 [01:35<01:51, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 46%|▍| 44/96 [01:37<01:51, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 48%|▍| 46/96 [01:41<01:44, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 49%|▍| 47/96 [01:43<01:41, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/query/MatMul: 29%|▎| 7/24 [00:14<0\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 50%|▌| 48/96 
[01:45<01:42, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 52%|▌| 50/96 [01:49<01:37, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 53%|▌| 51/96 [01:52<01:35, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 54%|▌| 52/96 [01:54<01:34, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 55%|▌| 53/96 [01:56<01:32, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 56%|▌| 54/96 [01:58<01:29, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 58%|▌| 56/96 [02:02<01:23, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 59%|▌| 57/96 [02:04<01:23, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 60%|▌| 58/96 [02:07<01:25, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 61%|▌| 59/96 [02:10<01:28, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 62%|▋| 60/96 [02:12<01:24, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 64%|▋| 61/96 [02:14<01:20, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/query/MatMul: 83%|▊| 20/24 [00:46<\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 66%|▋| 63/96 [02:18<01:12, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 67%|▋| 64/96 [02:20<01:09, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 68%|▋| 65/96 [02:23<01:08, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/query/MatMul: 100%|█| 24/24 [00:54<\u001b[A\n", "\n", - "Compressing /encoder/layer.10/attention/self/value/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 4% 1/24 [00:00<00:17, 1.28it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 8% 2/24 [00:01<00:16, 1.30it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 17% 4/24 
[00:03<00:15, 1.29it/s]\n", - "Compressing /encoder/layer.10/attention/output/dense/MatMul: 17% 4/24 [00:03<00:16, 1.24it/s]\u001b[A\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 25% 6/24 [00:04<00:14, 1.27it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 29% 7/24 [00:05<00:13, 1.28it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 33% 8/24 [00:06<00:12, 1.27it/s]\n", - "Compressing /encoder/layer.10/attention/output/dense/MatMul: 33% 8/24 [00:06<00:12, 1.26it/s]\u001b[A\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 38% 9/24 [00:07<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 46% 11/24 [00:08<00:10, 1.22it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 50% 12/24 [00:09<00:09, 1.21it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 54% 13/24 [00:10<00:09, 1.19it/s]\n", - "Compressing /encoder/layer.10/attention/output/dense/MatMul: 54% 13/24 [00:10<00:09, 1.19it/s]\u001b[A\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 58% 14/24 [00:11<00:08, 1.18it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 67% 16/24 [00:13<00:07, 1.06it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 71% 17/24 [00:14<00:06, 1.03it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 75% 18/24 [00:15<00:05, 1.01it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 79% 19/24 [00:16<00:05, 1.00s/it]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 83% 20/24 [00:17<00:04, 1.01s/it]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 88% 21/24 [00:18<00:03, 1.00s/it]\n", - "Compressing /encoder/layer.10/attention/output/dense/MatMul: 88% 21/24 [00:18<00:02, 1.00it/s]\u001b[A\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 92% 22/24 [00:19<00:01, 1.06it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 96% 
23/24 [00:20<00:00, 1.10it/s]\n", - "Compressing /encoder/layer.10/attention/output/dense/MatMul: 100% 24/24 [00:20<00:00, 1.14it/s]\n", - "Compressing /encoder/layer.10/intermediate/dense/MatMul: 100% 24/24 [00:20<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.10/output/dense/MatMul: 0% 0/96 [00:00, ?it/s]\u001b[A\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 4% 1/24 [00:00<00:18, 1.24it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 8% 2/24 [00:01<00:17, 1.24it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 12% 3/24 [00:02<00:16, 1.25it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 17% 4/24 [00:03<00:15, 1.25it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 21% 5/24 [00:04<00:15, 1.25it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 25% 6/24 [00:04<00:14, 1.24it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 29% 7/24 [00:05<00:13, 1.25it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 33% 8/24 [00:06<00:12, 1.25it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 38% 9/24 [00:07<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 42% 10/24 [00:08<00:11, 1.24it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 46% 11/24 [00:08<00:10, 1.21it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 50% 12/24 [00:09<00:09, 1.21it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 58% 14/24 [00:11<00:08, 1.14it/s]\n", - "Compressing /encoder/layer.10/output/dense/MatMul: 15% 14/96 [00:11<01:13, 1.11it/s]\u001b[A\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 67% 16/24 [00:13<00:07, 1.04it/s]\n", - "Compressing /encoder/layer.10/output/dense/MatMul: 17% 16/96 [00:13<01:17, 1.03it/s]\u001b[A\n", - "Compressing 
/encoder/layer.11/attention/self/key/MatMul: 71% 17/24 [00:14<00:06, 1.02it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 75% 18/24 [00:15<00:05, 1.01it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 83% 20/24 [00:17<00:03, 1.02it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 88% 21/24 [00:18<00:02, 1.09it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 92% 22/24 [00:19<00:01, 1.14it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 96% 23/24 [00:19<00:00, 1.18it/s]\n", - "Compressing /encoder/layer.11/attention/self/key/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 69%|▋| 66/96 [02:25<01:08, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 70%|▋| 67/96 [02:27<01:05, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 71%|▋| 68/96 [02:29<01:01, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 72%|▋| 69/96 [02:31<00:57, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 73%|▋| 70/96 [02:33<00:53, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 74%|▋| 71/96 [02:35<00:52, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 75%|▊| 72/96 [02:38<00:51, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 76%|▊| 73/96 [02:40<00:50, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/value/MatMul: 33%|▎| 8/24 [00:16<0\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 77%|▊| 74/96 [02:42<00:48, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 78%|▊| 75/96 [02:44<00:46, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 79%|▊| 76/96 [02:47<00:44, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 80%|▊| 77/96 [02:49<00:42, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/value/MatMul: 54%|▌| 13/24 [00:26<\u001b[A\n", + 
"Compressing /encoder/layer.9/output/dense/MatMul: 81%|▊| 78/96 [02:51<00:41, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 82%|▊| 79/96 [02:54<00:40, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 83%|▊| 80/96 [02:57<00:39, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/value/MatMul: 71%|▋| 17/24 [00:33<\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 84%|▊| 81/96 [02:59<00:38, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/value/MatMul: 79%|▊| 19/24 [00:37<\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 85%|▊| 82/96 [03:03<00:38, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 86%|▊| 83/96 [03:05<00:36, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 88%|▉| 84/96 [03:08<00:32, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 89%|▉| 85/96 [03:10<00:28, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/self/value/MatMul: 100%|█| 24/24 [00:47<\u001b[A\n", "\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 8% 2/24 [00:01<00:16, 1.31it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 12% 3/24 [00:02<00:16, 1.28it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 17% 4/24 [00:03<00:15, 1.28it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 21% 5/24 [00:03<00:14, 1.27it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 25% 6/24 [00:04<00:14, 1.27it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 29% 7/24 [00:05<00:13, 1.29it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 33% 8/24 [00:06<00:12, 1.29it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 38% 9/24 [00:07<00:11, 1.26it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 42% 10/24 
[00:07<00:11, 1.25it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 46% 11/24 [00:08<00:10, 1.23it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 50% 12/24 [00:09<00:09, 1.22it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 54% 13/24 [00:10<00:09, 1.19it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 58% 14/24 [00:11<00:08, 1.11it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 62% 15/24 [00:12<00:08, 1.04it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 67% 16/24 [00:13<00:07, 1.01it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 71% 17/24 [00:14<00:07, 1.00s/it]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 75% 18/24 [00:15<00:06, 1.01s/it]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 79% 19/24 [00:16<00:05, 1.02s/it]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 83% 20/24 [00:17<00:03, 1.06it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 88% 21/24 [00:18<00:02, 1.12it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 92% 22/24 [00:19<00:01, 1.17it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 96% 23/24 [00:19<00:00, 1.20it/s]\n", - "Compressing /encoder/layer.11/attention/self/query/MatMul: 100% 24/24 [00:20<00:00, 1.17it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 4% 1/24 [00:00<00:17, 1.30it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 8% 2/24 [00:01<00:17, 1.29it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 12% 3/24 [00:02<00:16, 1.27it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 17% 4/24 [00:03<00:15, 1.26it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 21% 5/24 [00:03<00:14, 
1.27it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 25% 6/24 [00:04<00:14, 1.27it/s]\n", - "Compressing /encoder/layer.10/output/dense/MatMul: 56% 54/96 [00:46<00:32, 1.29it/s]\u001b[A\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 29% 7/24 [00:05<00:13, 1.26it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 33% 8/24 [00:06<00:13, 1.22it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 38% 9/24 [00:07<00:12, 1.22it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 42% 10/24 [00:08<00:11, 1.21it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 46% 11/24 [00:09<00:11, 1.15it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 50% 12/24 [00:09<00:10, 1.12it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 54% 13/24 [00:10<00:10, 1.08it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 58% 14/24 [00:11<00:09, 1.05it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 62% 15/24 [00:13<00:08, 1.01it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 67% 16/24 [00:14<00:08, 1.00s/it]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 71% 17/24 [00:15<00:07, 1.01s/it]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 75% 18/24 [00:16<00:06, 1.02s/it]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 79% 19/24 [00:17<00:05, 1.00s/it]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 83% 20/24 [00:17<00:03, 1.06it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 88% 21/24 [00:18<00:02, 1.12it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 92% 22/24 [00:19<00:01, 1.16it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 96% 23/24 [00:20<00:00, 1.20it/s]\n", - "Compressing /encoder/layer.11/attention/self/value/MatMul: 100% 24/24 [00:21<00:00, 
1.14it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 0% 0/24 [00:00, ?it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 4% 1/24 [00:00<00:17, 1.28it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 8% 2/24 [00:01<00:17, 1.27it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 12% 3/24 [00:02<00:16, 1.28it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 17% 4/24 [00:03<00:15, 1.28it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 21% 5/24 [00:03<00:14, 1.30it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 25% 6/24 [00:04<00:13, 1.29it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 29% 7/24 [00:05<00:13, 1.27it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 33% 8/24 [00:06<00:12, 1.24it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 38% 9/24 [00:07<00:12, 1.23it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 42% 10/24 [00:07<00:11, 1.21it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 46% 11/24 [00:08<00:10, 1.22it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 50% 12/24 [00:09<00:10, 1.20it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 54% 13/24 [00:10<00:09, 1.12it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 58% 14/24 [00:11<00:09, 1.06it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 62% 15/24 [00:12<00:08, 1.02it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 71% 17/24 [00:14<00:07, 1.01s/it]\n", - "Compressing /encoder/layer.10/output/dense/MatMul: 93% 89/96 [01:17<00:07, 1.02s/it]\u001b[A\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 75% 18/24 [00:15<00:06, 1.01s/it]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 83% 
20/24 [00:17<00:03, 1.11it/s]\n", - "Compressing /encoder/layer.10/output/dense/MatMul: 96% 92/96 [01:20<00:03, 1.11it/s]\u001b[A\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 88% 21/24 [00:18<00:02, 1.16it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 92% 22/24 [00:19<00:01, 1.19it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 96% 23/24 [00:19<00:00, 1.21it/s]\n", - "Compressing /encoder/layer.10/output/dense/MatMul: 100% 96/96 [01:23<00:00, 1.15it/s]\n", - "Compressing /encoder/layer.11/attention/output/dense/MatMul: 100% 24/24 [00:20<00:00, 1.16it/s]\n", - "Compressing /encoder/layer.11/intermediate/dense/MatMul: 100% 24/24 [00:08<00:00, 2.88it/s]\n", - "INFO:sparsifyml.one_shot.sparsification.obcq.fast_obcq_modifier:Compressed model to effective overall sparsity of 0.49656408804434315. Full sparsity profile saved to experiments/20230925-132203/OBCQ/effective_sparsity_profile.json\n", + "Compressing /encoder/layer.10/attention/output/dense/MatMul: 0%| | 0/24 [00:00\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 90%|▉| 86/96 [03:13<00:27, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 91%|▉| 87/96 [03:16<00:25, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/output/dense/MatMul: 12%|▏| 3/24 [00:05\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 92%|▉| 88/96 [03:19<00:22, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 93%|▉| 89/96 [03:22<00:19, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/output/dense/MatMul: 25%|▎| 6/24 [00:11\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 94%|▉| 90/96 [03:25<00:16, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 95%|▉| 91/96 [03:27<00:13, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 96%|▉| 92/96 [03:30<00:10, 2\u001b[A\n", + "Compressing /encoder/layer.10/attention/output/dense/MatMul: 42%|▍| 10/24 
[00:1\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 97%|▉| 93/96 [03:33<00:08, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 98%|▉| 94/96 [03:35<00:05, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 99%|▉| 95/96 [03:37<00:02, 2\u001b[A\n", + "Compressing /encoder/layer.9/output/dense/MatMul: 100%|█| 96/96 [03:39<00:00, 2\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 0%| | 0/24 [00:00, \n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 4%| | 1/24 [00:02<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 12%|▏| 3/24 [00:05<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 17%|▏| 4/24 [00:07<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 21%|▏| 5/24 [00:09<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 25%|▎| 6/24 [00:11<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 29%|▎| 7/24 [00:14<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 33%|▎| 8/24 [00:16<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 38%|▍| 9/24 [00:18<00:\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 42%|▍| 10/24 [00:20<00\u001b[A\n", + "Compressing /encoder/layer.10/attention/output/dense/MatMul: 100%|█| 24/24 [00:4\u001b[A\n", + "\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 46%|▍| 11/24 [00:22<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 50%|▌| 12/24 [00:24<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 54%|▌| 13/24 [00:26<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 58%|▌| 14/24 [00:29<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 62%|▋| 15/24 [00:31<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 67%|▋| 16/24 
[00:32<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 71%|▋| 17/24 [00:34<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 75%|▊| 18/24 [00:36<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 79%|▊| 19/24 [00:38<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 88%|▉| 21/24 [00:43<00\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 10%| | 10/96 [00:21<03:10, \u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 92%|▉| 22/24 [00:45<00\u001b[A\n", + "Compressing /encoder/layer.10/intermediate/dense/MatMul: 100%|█| 24/24 [00:49<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 0%| | 0/24 [00:00, \n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 4%| | 1/24 [00:01<00:\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 8%| | 2/24 [00:03<00:\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 12%|▏| 3/24 [00:06<00:\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 17%|▏| 4/24 [00:08<00:\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 21%|▏| 5/24 [00:10<00:\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 25%|▎| 6/24 [00:12<00:\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 29%|▎| 7/24 [00:14<00:\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 38%|▍| 9/24 [00:18<00:\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 22%|▏| 21/96 [00:45<02:51, \u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 42%|▍| 10/24 [00:20<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 46%|▍| 11/24 [00:22<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 50%|▌| 12/24 [00:24<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 54%|▌| 13/24 
[00:27<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 62%|▋| 15/24 [00:30<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 67%|▋| 16/24 [00:32<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 71%|▋| 17/24 [00:35<00\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 30%|▎| 29/96 [01:02<02:25, \u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 79%|▊| 19/24 [00:39<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 83%|▊| 20/24 [00:41<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 88%|▉| 21/24 [00:43<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 92%|▉| 22/24 [00:45<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 96%|▉| 23/24 [00:47<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/key/MatMul: 100%|█| 24/24 [00:49<00\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 0%| | 0/24 [00:00\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 38%|▍| 36/96 [01:18<02:09, \u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 4%| | 1/24 [00:02<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 8%| | 2/24 [00:04<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 12%|▏| 3/24 [00:06<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 17%|▏| 4/24 [00:09<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 21%|▏| 5/24 [00:11<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 25%|▎| 6/24 [00:13<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 29%|▎| 7/24 [00:15<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 33%|▎| 8/24 [00:17<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 38%|▍| 9/24 
[00:19<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 42%|▍| 10/24 [00:21<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 46%|▍| 11/24 [00:24<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 50%|▌| 12/24 [00:26<\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 51%|▌| 49/96 [01:43<01:33, \u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 54%|▌| 13/24 [00:28<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 58%|▌| 14/24 [00:30<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 67%|▋| 16/24 [00:34<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 71%|▋| 17/24 [00:36<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 75%|▊| 18/24 [00:38<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 79%|▊| 19/24 [00:41<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 83%|▊| 20/24 [00:43<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 88%|▉| 21/24 [00:45<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 96%|▉| 23/24 [00:49<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/query/MatMul: 100%|█| 24/24 [00:51<\u001b[A\n", + "\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 4%| | 1/24 [00:02<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 8%| | 2/24 [00:04<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 12%|▏| 3/24 [00:06<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 17%|▏| 4/24 [00:08<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 21%|▏| 5/24 [00:10<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 25%|▎| 6/24 [00:12<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 33%|▎| 
8/24 [00:15<0\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 38%|▍| 9/24 [00:17<0\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 71%|▋| 68/96 [02:28<01:00, \u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 46%|▍| 11/24 [00:22<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 50%|▌| 12/24 [00:24<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 54%|▌| 13/24 [00:26<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 58%|▌| 14/24 [00:28<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 62%|▋| 15/24 [00:30<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 67%|▋| 16/24 [00:32<\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 78%|▊| 75/96 [02:42<00:42, \u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 71%|▋| 17/24 [00:34<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 79%|▊| 19/24 [00:38<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 83%|▊| 20/24 [00:40<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 88%|▉| 21/24 [00:42<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 92%|▉| 22/24 [00:44<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 96%|▉| 23/24 [00:46<\u001b[A\n", + "Compressing /encoder/layer.11/attention/self/value/MatMul: 100%|█| 24/24 [00:48<\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 0%| | 0/24 [00:00\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 8%| | 2/24 [00:03\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 12%|▏| 3/24 [00:05\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 17%|▏| 4/24 [00:07\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 90%|▉| 86/96 [03:06<00:21, 
\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 25%|▎| 6/24 [00:12\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 29%|▎| 7/24 [00:14\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 33%|▎| 8/24 [00:15\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 38%|▍| 9/24 [00:18\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 42%|▍| 10/24 [00:2\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 46%|▍| 11/24 [00:2\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 50%|▌| 12/24 [00:2\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 54%|▌| 13/24 [00:2\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 58%|▌| 14/24 [00:2\u001b[A\n", + "Compressing /encoder/layer.10/output/dense/MatMul: 100%|█| 96/96 [03:28<00:00, \u001b[A\n", + "\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 67%|▋| 16/24 [00:3\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 71%|▋| 17/24 [00:3\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 75%|▊| 18/24 [00:3\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 79%|▊| 19/24 [00:3\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 83%|▊| 20/24 [00:3\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 88%|▉| 21/24 [00:4\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 92%|▉| 22/24 [00:4\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 96%|▉| 23/24 [00:4\u001b[A\n", + "Compressing /encoder/layer.11/attention/output/dense/MatMul: 100%|█| 24/24 [00:4\u001b[A\n", + "\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 38%|▍| 9/24 [00:18<00:\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 42%|▍| 
10/24 [00:19<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 46%|▍| 11/24 [00:20<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 50%|▌| 12/24 [00:21<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 54%|▌| 13/24 [00:21<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 58%|▌| 14/24 [00:22<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 62%|▋| 15/24 [00:23<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 67%|▋| 16/24 [00:23<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 71%|▋| 17/24 [00:24<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 75%|▊| 18/24 [00:25<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 79%|▊| 19/24 [00:26<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 83%|▊| 20/24 [00:26<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 88%|▉| 21/24 [00:27<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 92%|▉| 22/24 [00:28<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 96%|▉| 23/24 [00:29<00\u001b[A\n", + "Compressing /encoder/layer.11/intermediate/dense/MatMul: 100%|█| 24/24 [00:29<00\u001b[A\n", + "INFO:sparsifyml.one_shot.sparsification.obcq.fast_obcq_modifier:Compressed model to effective overall sparsity of 0.014212667206187307. 
Full sparsity profile saved to experiments/20231114-091852/OBCQ/effective_sparsity_profile.json\n", "INFO:sparsifyml.one_shot.sparsification.obcq.fast_obcq_modifier:Running min/max calibration against dataset on 96 network activations\n", "INFO:sparsifyml.one_shot.sparsification.obcq.fast_obcq_modifier:Calibration complete\n", "INFO:sparsifyml.one_shot.sparsification.quantization.qdq:Injected Q/DQ blocks for 72 weights and 120 activations\n", - "2023-09-25 13:43:04 sparseml.exporters.transforms.onnx_transform INFO [ConstantsToInitializers] Transformed 285 matches\n", + "2023-11-14 10:05:13 sparseml.exporters.transforms.onnx_transform INFO [ConstantsToInitializers] Transformed 285 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[ConstantsToInitializers] Transformed 285 matches\n", - "2023-09-25 13:43:05 sparseml.exporters.transforms.onnx_transform INFO [FoldIdentityInitializers] Transformed 0 matches\n", + "2023-11-14 10:05:13 sparseml.exporters.transforms.onnx_transform INFO [FoldIdentityInitializers] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[FoldIdentityInitializers] Transformed 0 matches\n", - "2023-09-25 13:43:05 sparseml.exporters.transforms.onnx_transform INFO [InitializersToUint8] Transformed 168 matches\n", + "2023-11-14 10:05:13 sparseml.exporters.transforms.onnx_transform INFO [InitializersToUint8] Transformed 168 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[InitializersToUint8] Transformed 168 matches\n", - "2023-09-25 13:43:06 sparseml.exporters.transforms.onnx_transform INFO [FlattenQParams] Transformed 0 matches\n", + "2023-11-14 10:05:14 sparseml.exporters.transforms.onnx_transform INFO [FlattenQParams] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[FlattenQParams] Transformed 0 matches\n", - "2023-09-25 13:43:07 sparseml.exporters.transforms.onnx_transform INFO [FoldConvDivBn] Transformed 0 matches\n", + "2023-11-14 10:05:14 
sparseml.exporters.transforms.onnx_transform INFO [FoldConvDivBn] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[FoldConvDivBn] Transformed 0 matches\n", - "2023-09-25 13:43:08 sparseml.exporters.transforms.onnx_transform INFO [DeleteRepeatedQdq] Transformed 0 matches\n", + "2023-11-14 10:05:15 sparseml.exporters.transforms.onnx_transform INFO [DeleteRepeatedQdq] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[DeleteRepeatedQdq] Transformed 0 matches\n", - "2023-09-25 13:43:09 sparseml.exporters.transforms.onnx_transform INFO [QuantizeQATEmbedding] Transformed 0 matches\n", + "2023-11-14 10:05:15 sparseml.exporters.transforms.onnx_transform INFO [QuantizeQATEmbedding] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[QuantizeQATEmbedding] Transformed 0 matches\n", - "2023-09-25 13:43:09 sparseml.exporters.transforms.onnx_transform INFO [PropagateEmbeddingQuantization] Transformed 0 matches\n", + "2023-11-14 10:05:15 sparseml.exporters.transforms.onnx_transform INFO [PropagateEmbeddingQuantization] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[PropagateEmbeddingQuantization] Transformed 0 matches\n", - "2023-09-25 13:43:10 sparseml.exporters.transforms.onnx_transform INFO [PropagateDequantThroughSplit] Transformed 0 matches\n", + "2023-11-14 10:05:16 sparseml.exporters.transforms.onnx_transform INFO [PropagateDequantThroughSplit] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[PropagateDequantThroughSplit] Transformed 0 matches\n", - "2023-09-25 13:43:11 sparseml.exporters.transforms.onnx_transform INFO [MatMulToQLinearMatMul] Transformed 0 matches\n", - "INFO:sparseml.exporters.transforms.onnx_transform:[MatMulToQLinearMatMul] Transformed 0 matches\n", - "2023-09-25 13:43:13 sparseml.exporters.transforms.onnx_transform INFO [MatMulAddToMatMulIntegerAddCastMul] Transformed 72 matches\n", + "2023-11-14 10:05:17 
sparseml.exporters.transforms.onnx_transform INFO [MatMulAddToMatMulIntegerAddCastMul] Transformed 72 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[MatMulAddToMatMulIntegerAddCastMul] Transformed 72 matches\n", - "2023-09-25 13:43:14 sparseml.exporters.transforms.onnx_transform INFO [MatMulToMatMulIntegerCastMul] Transformed 24 matches\n", + "2023-11-14 10:05:18 sparseml.exporters.transforms.onnx_transform INFO [MatMulToMatMulIntegerCastMul] Transformed 24 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[MatMulToMatMulIntegerCastMul] Transformed 24 matches\n", - "2023-09-25 13:43:14 sparseml.exporters.transforms.onnx_transform INFO [FoldReLUQuants] Transformed 0 matches\n", + "2023-11-14 10:05:18 sparseml.exporters.transforms.onnx_transform INFO [FoldReLUQuants] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[FoldReLUQuants] Transformed 0 matches\n", - "2023-09-25 13:43:15 sparseml.exporters.transforms.onnx_transform INFO [ConvToConvIntegerAddCastMul] Transformed 0 matches\n", + "2023-11-14 10:05:18 sparseml.exporters.transforms.onnx_transform INFO [ConvToConvIntegerAddCastMul] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[ConvToConvIntegerAddCastMul] Transformed 0 matches\n", - "2023-09-25 13:43:15 sparseml.exporters.transforms.onnx_transform INFO [GemmToQLinearMatMul] Transformed 0 matches\n", + "2023-11-14 10:05:18 sparseml.exporters.transforms.onnx_transform INFO [GemmToQLinearMatMul] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[GemmToQLinearMatMul] Transformed 0 matches\n", - "2023-09-25 13:43:16 sparseml.exporters.transforms.onnx_transform INFO [GemmToMatMulIntegerAddCastMul] Transformed 0 matches\n", + "2023-11-14 10:05:19 sparseml.exporters.transforms.onnx_transform INFO [GemmToMatMulIntegerAddCastMul] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[GemmToMatMulIntegerAddCastMul] Transformed 0 
matches\n", - "2023-09-25 13:43:16 sparseml.exporters.transforms.onnx_transform INFO [QuantizeResiduals] Transformed 0 matches\n", + "2023-11-14 10:05:19 sparseml.exporters.transforms.onnx_transform INFO [QuantizeResiduals] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[QuantizeResiduals] Transformed 0 matches\n", - "2023-09-25 13:43:17 sparseml.exporters.transforms.onnx_transform INFO [RemoveDuplicateQConvWeights] Transformed 0 matches\n", + "2023-11-14 10:05:19 sparseml.exporters.transforms.onnx_transform INFO [RemoveDuplicateQConvWeights] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[RemoveDuplicateQConvWeights] Transformed 0 matches\n", - "2023-09-25 13:43:18 sparseml.exporters.transforms.onnx_transform INFO [RemoveDuplicateQuantizeOps] Transformed 0 matches\n", + "2023-11-14 10:05:20 sparseml.exporters.transforms.onnx_transform INFO [RemoveDuplicateQuantizeOps] Transformed 0 matches\n", "INFO:sparseml.exporters.transforms.onnx_transform:[RemoveDuplicateQuantizeOps] Transformed 0 matches\n", - "INFO:sparsify.run one_shot: Model saved to deployment directory: /content/deployment\n" + "INFO:sparsify.run one_shot: Model saved to deployment directory: /home/zeroshot/nm/examples/notebooks/sparsify-bge-small/deployment\n" ] } ], "source": [ - "!sparsify.run one-shot --use-case nlp-embeddings --model ./dense-bge-small-en-v1.5/model.onnx --data ./data --optim-level 0.5" + "!sparsify.run one-shot --use-case nlp-embeddings --model ./bge-small-en-v1.5-dense/model.onnx --data ./data --optim-level 0.2" ] }, { @@ -2379,11 +1978,30 @@ "metadata": { "id": "v_GRgLzSx2qR" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], "source": [ - "!mv deployment sparse-bge-small-en-v1.5\n", - "!cp dense-bge-small-en-v1.5/tokenizer.json sparse-bge-small-en-v1.5/\n", - "!cp dense-bge-small-en-v1.5/config.json sparse-bge-small-en-v1.5/" + "!mv deployment bge-small-en-v1.5-quant\n", + "!cp bge-small-en-v1.5-dense/tokenizer.json bge-small-en-v1.5-quant/\n", + "!cp bge-small-en-v1.5-dense/config.json bge-small-en-v1.5-quant/" ] }, { @@ -2392,51 +2010,12 @@ "id": "qa-peskhdxhP" }, "source": [ - "# Create a Custom Sentence Embeddings Pipeline\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "id": "ml2t2eGcx2sn" - }, - "outputs": [], - "source": [ - "from transformers import Pipeline\n", - "import torch.nn.functional as F\n", - "import torch\n", - "\n", - "def mean_pooling(model_output, attention_mask):\n", - " token_embeddings = model_output[0]\n", - " input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n", - " return torch.sum(token_embeddings * input_mask_expanded, 1) / 
torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n", - "\n", - "class SentenceEmbeddingPipeline(Pipeline):\n", - " def _sanitize_parameters(self, **kwargs):\n", - " preprocess_kwargs = {}\n", - " return preprocess_kwargs, {}, {}\n", - "\n", - " def preprocess(self, inputs):\n", - " encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')\n", - " return encoded_inputs\n", - "\n", - " def _forward(self, model_inputs):\n", - " outputs = self.model(**model_inputs)\n", - " return {\"outputs\": outputs, \"attention_mask\": model_inputs[\"attention_mask\"]}\n", - "\n", - " def postprocess(self, model_outputs):\n", - " # Perform pooling\n", - " sentence_embeddings = mean_pooling(model_outputs[\"outputs\"], model_outputs['attention_mask'])\n", - " # Normalize embeddings\n", - " sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n", - " return sentence_embeddings" + "# Testing the DeepSparseSentenceTransformers Embeddings Pipeline " ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2449,41 +2028,43 @@ "name": "stderr", "output_type": "stream", "text": [ - "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n", - "Model is dynamic and has no shapes defined, skipping reshape..\n", - "Model is dynamic and has no shapes defined, skipping reshape..\n" + "2023-11-14 10:07:38 deepsparse.engine WARNING batch_size < 1 so disabling batch size override\n", + "[nm_ort 7f340d094000 >WARN< is_supported_graph src/onnxruntime_neuralmagic/supported/ops.cc:150] Warning: Optimized runtime disabled - Detected dynamic input input_ids dim 0. 
Set inputs to static shapes to enable optimal performance.\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bcc92fdee48245bfba23d24fb628789c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ - "torch.Size([1, 384])\n", - "torch.Size([1, 384])\n" + "(384,)\n" ] } ], "source": [ - "dense = \"dense-bge-small-en-v1.5\"\n", - "sparse = \"sparse-bge-small-en-v1.5\"\n", - "\n", - "dense_model = DeepSparseModelForFeatureExtraction.from_pretrained(dense, export=False)\n", - "tokenizer = get_preprocessor(dense)\n", - "\n", - "sparse_model = DeepSparseModelForFeatureExtraction.from_pretrained(sparse, export=False)\n", - "tokenizer = get_preprocessor(sparse)\n", + "from deepsparse.sentence_transformers import DeepSparseSentenceTransformer\n", "\n", - "dense_pipe = SentenceEmbeddingPipeline(model=dense_model, tokenizer=tokenizer)\n", - "sparse_pipe = SentenceEmbeddingPipeline(model=sparse_model, tokenizer=tokenizer)\n", + "quant = \"bge-small-en-v1.5-quant\"\n", + "sample_text = \"I love quantized embedding models!\"\n", "\n", - "sample_text = \"I love sparse embedding models!\"\n", - "\n", - "dense_infer = dense_pipe(sample_text)\n", - "sparse_infer = sparse_pipe(sample_text)\n", + "quant_pipe = DeepSparseSentenceTransformer(quant, export=False)\n", + "quant_infer = quant_pipe.encode(sample_text)\n", "\n", "# Get Shapes\n", - "print(dense_infer.shape)\n", - "print(sparse_infer.shape)" + "print(quant_infer.shape)" ] }, { @@ -2492,12 +2073,34 @@ "id": "OfSt2YhQeBVR" }, "source": [ - "# Evaluate the Dense vs. Sparse BGE Models for Accuracy on STSB" + "# Evaluate the Accuracy of the Dense vs. 
Quantized BGE Models on the STSB Dataset\n", + "\n", + "The [DeepSparseSentenceTransformer](https://github.com/neuralmagic/deepsparse/tree/main/src/deepsparse/sentence_transformers) integration allows easy access for compressed models to be evaluated on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard). Let's compare the performance of the dense vs. quantized models on the STSB validation split:" ] }, { "cell_type": "code", "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "!pip install mteb -q" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2531,37 +2134,180 @@ "outputId": "ad8b3815-6db0-4688-b581-4e6368f0556a" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-11-14 10:10:46 deepsparse.engine WARNING batch_size < 1 so disabling batch size override\n", + "[nm_ort 7f340d094000 >WARN< is_supported_graph src/onnxruntime_neuralmagic/supported/ops.cc:150] Warning: Optimized runtime disabled - Detected dynamic input input_ids dim 0. Set inputs to static shapes to enable optimal performance.\n" + ] + }, + { + "data": { + "text/html": [ + "
───────────────────────────────────────────────── Selected tasks ─────────────────────────────────────────────────\n", + "\n" + ], + "text/plain": [ + "\u001b[38;5;235m───────────────────────────────────────────────── \u001b[0m\u001b[1mSelected tasks \u001b[0m\u001b[38;5;235m ─────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
STS\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1mSTS\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ " - STSBenchmark, s2s\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " - STSBenchmark, \u001b[3;38;5;241ms2s\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n", + "\n", + "\n" + ], + "text/plain": [ + "\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b1a72db93db14ad0882cf299c137b957", + "model_id": "2df95ea0a5e84907a3a852cfb6b2b755", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Downloading builder script: 0%| | 0.00/5.75k [00:00, ?B/s]" + "Batches: 0%| | 0/24 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Parameter 'function'=
───────────────────────────────────────────────── Selected tasks ─────────────────────────────────────────────────\n", + "\n" + ], + "text/plain": [ + "\u001b[38;5;235m───────────────────────────────────────────────── \u001b[0m\u001b[1mSelected tasks \u001b[0m\u001b[38;5;235m ─────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
STS\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1mSTS\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ " - STSBenchmark, s2s\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " - STSBenchmark, \u001b[3;38;5;241ms2s\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n", + "\n", + "\n" + ], + "text/plain": [ + "\n", + "\n" ] }, "metadata": {}, @@ -2571,45 +2317,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "dense model: pearson=0.8913432543187466%\n", - "sparse model: pearson=0.8563085613094055%\n", - "The sparse model achieves 96.00% accuracy of the dense model\n" + "{'STSBenchmark': {'mteb_version': '1.1.1', 'dataset_revision': 'b0fddb56ed78048fa8b90373c8a3cfc37b684831', 'mteb_dataset_name': 'STSBenchmark', 'validation': {'cos_sim': {'pearson': 0.8828211766495108, 'spearman': 0.8892465763120051}, 'manhattan': {'pearson': 0.886201824808084, 'spearman': 0.8907627276162985}, 'euclidean': {'pearson': 0.8868149931196716, 'spearman': 0.8913096186609996}, 'evaluation_time': 4.43}, 'test': {'cos_sim': {'pearson': 0.8431285123201885, 'spearman': 0.8586295017067542}, 'manhattan': {'pearson': 0.854393933014824, 'spearman': 0.8591549232752812}, 'euclidean': {'pearson': 0.8565471782504085, 'spearman': 0.8612847755343875}, 'evaluation_time': 1.19}}}\n" ] } ], "source": [ - "from datasets import load_dataset\n", - "from evaluate import load\n", - "import torch\n", - "\n", - "eval_dataset = load_dataset(\"glue\",\"stsb\",split=\"validation\")\n", - "metric = load('glue', 'stsb')\n", - "\n", - "def compute_sentence_similarity(sentence_1, sentence_2, pipeline):\n", - " embedding_1 = pipeline(sentence_1)\n", - " embedding_2 = pipeline(sentence_2)\n", + "from mteb import MTEB\n", "\n", - " return torch.nn.functional.cosine_similarity(embedding_1, embedding_2, dim=1)\n", + "# Specify the model to use\n", + "quant = \"bge-small-en-v1.5-quant\"\n", + "dense = \"BAAI/bge-small-en-v1.5\"\n", "\n", - "def evaluate_stsb(example):\n", - " default = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], dense_pipe)\n", - " sparse = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], sparse_pipe)\n", - " return {\n", - " 'reference': (example[\"label\"] - 1) / (5 - 1),\n", - " 'default': 
float(default),\n", - " 'sparse': float(sparse),\n", - " }\n", + "# DeepSparse Model Evaluation\n", + "from deepsparse.sentence_transformers import DeepSparseSentenceTransformer\n", + "model = DeepSparseSentenceTransformer(quant, export=False)\n", + "evaluation = MTEB(tasks=[\"STSBenchmark\"])\n", + "results_ds = evaluation.run(model, output_folder=f\"results/ds-{quant}\")\n", + "print(results_ds)\n", "\n", - "# run evaluation\n", - "result = eval_dataset.map(evaluate_stsb)\n", - "\n", - "# compute metrics\n", - "default_acc = metric.compute(predictions=result[\"default\"], references=result[\"reference\"])\n", - "sparse = metric.compute(predictions=result[\"sparse\"], references=result[\"reference\"])\n", - "\n", - "print(f\"dense model: pearson={default_acc['pearson']}%\")\n", - "print(f\"sparse model: pearson={sparse['pearson']}%\")\n", - "print(f\"The sparse model achieves {round(sparse['pearson']/default_acc['pearson'],2)*100:.2f}% accuracy of the dense model\")" + "# Original SentenceTransformers Model Evaluation\n", + "import sentence_transformers\n", + "model = sentence_transformers.SentenceTransformer(dense)\n", + "evaluation = MTEB(tasks=[\"STSBenchmark\"])\n", + "results_st = evaluation.run(model, output_folder=f\"results/st-{dense}\")\n", + "print(results_st)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The quantized model achieves 99.9% recovery when compared to the dense model on MTEB's `cos_sim` `spearman` metric." ] }, { @@ -2618,93 +2356,102 @@ "id": "5MMJ8PG-eNJO" }, "source": [ - "# Benchmark the Dense PyTorch vs. Sparse ONNX Model for Latency" + "# Benchmark the Dense PyTorch vs. Quantized ONNX Model for Latency\n", + "\n", + "In addition to the MTEB benchmarking, the integration includes a custom script for benchmarking latency and throughput, let's test how the dense vs. quantized model perform against each other. 
First, git clone deepsparse:" ] }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'deepsparse'...\n" + ] }, - "id": "KxSXJJmceYiQ", - "outputId": "96dfe7c5-35ed-4a0b-8abd-3c4df7c7e5a8" - }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "remote: Enumerating objects: 18974, done.\u001b[K\n", + "remote: Counting objects: 100% (5600/5600), done.\u001b[K\n", + "remote: Compressing objects: 100% (1547/1547), done.\u001b[K\n", + "remote: Total 18974 (delta 4935), reused 4451 (delta 4037), pack-reused 13374\u001b[K\n", + "Receiving objects: 100% (18974/18974), 139.80 MiB | 31.52 MiB/s, done.\n", + "Resolving deltas: 100% (13356/13356), done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/neuralmagic/deepsparse.git" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, run this CLI command to benchmmark the models' latency on encoding 100 sentences on a max sequence length=512 and batch size=1:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Payload sequence length: 367\n", - "dense model latency: P95 latency (ms) - 810.6698678000611; Average latency (ms) - 359.64 +\\- 171.19;\n", - "sparse model latency: P95 latency (ms) - 375.64537654984633; Average latency (ms) - 321.86 +\\- 39.60;\n", - "Latency improvement through one-shot on 2 CPU cores: 2.16x\n" + "2023-11-14 10:13:18 deepsparse.engine WARNING batch_size < 1 so disabling batch size override\n", + "DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.6.0.20231110 COMMUNITY | (6c521a73) (release) (optimized) (system=avx2_vnni, binary=avx2)\n", + "\u001b[34m[nm_ort 7f4e9f8be000 >WARN< \u001b[33m is_supported_graph src/onnxruntime_neuralmagic/supported/ops.cc:150\u001b[34m] \u001b[0mWarning: Optimized runtime disabled - Detected dynamic input input_ids dim 0. Set inputs to static shapes to enable optimal performance.\n", + "\n", + "[SentenceTransformer]\n", + "Batch size: 1, Sentence length: 700\n", + "Latency: 100 sentences in 23.41 seconds\n", + "Throughput: 4.27 sentences/second\n", + "Batches: 100%|████████████████████████████████| 100/100 [00:07<00:00, 14.13it/s]\n", + "\n", + "[DeepSparse Optimized]\n", + "Batch size: 1, Sentence length: 700\n", + "Latency: 100 sentences in 7.09 seconds\n", + "Throughput: 14.11 sentences/second\n" ] } ], "source": [ - "import subprocess\n", - "from time import perf_counter\n", - "import numpy as np\n", - "\n", - "payload = \"Greetings, I'm Jane the robot, residing in the vibrant city of Seattle, USA. 
\" \\\n", - " \"My journey involves crafting innovative solutions as a Software Architect, \" \\\n", - " \"driving technological progress through collaborative endeavors and cutting-edge research. \" \\\n", - " \"My experience spans across diverse domains, from optimizing supply chain logistics \" \\\n", - " \"to enhancing medical diagnostics. Passionate about exploring AI ethics and \" \\\n", - " \"the human-machine partnership, I'm constantly evolving to pioneer the future of technology. \" \\\n", - " \"In my spare time, I enjoy exploring the beautiful Pacific Northwest, \" \\\n", - " \"with its majestic mountains and pristine forests. I'm an avid hiker and often find \" \\\n", - " \"myself on the trails, seeking inspiration from nature's wonders. \" \\\n", - " \"When it comes to my work, I believe that artificial intelligence \" \\\n", - " \"has the potential to transform industries and improve people's lives. \" \\\n", - " \"I'm particularly interested in natural language processing and \" \\\n", - " \"machine learning, and I'm dedicated to pushing the boundaries of what AI can achieve. \" \\\n", - " \"In addition to my technical pursuits, I'm also a strong advocate \" \\\n", - " \"for diversity and inclusion in the tech industry. I believe that a diverse \" \\\n", - " \"and inclusive workforce leads to better innovation and more equitable \" \\\n", - " \"technological solutions for society. \" \\\n", - " \"I'm an enthusiastic problem solver and love tackling complex challenges. \" \\\n", - " \"My approach to problem-solving involves a combination of creativity, \" \\\n", - " \"data-driven analysis, and a keen understanding of user needs. \" \\\n", - " \"I'm always eager to collaborate with like-minded individuals \" \\\n", - " \"to bring innovative ideas to life. 
\" \\\n", - " \"When I'm not working on AI projects or exploring the outdoors, \" \\\n", - " \"I can often be found in the kitchen, experimenting with new recipes \" \\\n", - " \"and cooking up delicious meals for friends and family. \" \\\n", - " \"I believe that the joy of creating extends beyond technology \" \\\n", - " \"and into the realms of culinary art. \" \\\n", - " \"My aspiration is to continue pushing the boundaries \" \\\n", - " \"of what AI can achieve while making a positive impact on society.\"\n", - "\n", - "print(f'Payload sequence length: {len(tokenizer(payload)[\"input_ids\"])}')\n", - "\n", - "def measure_latency(pipe):\n", - " latencies = []\n", - "\n", - " # Timed run\n", - " for _ in range(100):\n", - " start_time = perf_counter()\n", - " _ = pipe(payload)\n", - " latency = perf_counter() - start_time\n", - " latencies.append(latency)\n", - "\n", - " # Compute run statistics\n", - " time_avg_ms = 1000 * np.mean(latencies)\n", - " time_std_ms = 1000 * np.std(latencies)\n", - " time_p95_ms = 1000 * np.percentile(latencies, 95)\n", - " return f\"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};\", time_p95_ms\n", - "\n", - "dense_model = measure_latency(dense_pipe)\n", - "quantized_model = measure_latency(sparse_pipe)\n", - "\n", - "# Get the number of CPU cores using the nproc command\n", - "num_cores = int(subprocess.check_output(\"nproc\").decode().strip())\n", - "\n", - "print(f\"dense model latency: {dense_model[0]}\")\n", - "print(f\"sparse model latency: {quantized_model[0]}\")\n", - "print(f\"Latency improvement through one-shot on {num_cores} CPU cores: {round(dense_model[1] / quantized_model[1], 2)}x\")\n" + "!python deepsparse/src/deepsparse/sentence_transformers/benchmark_encoding.py --base_model BAAI/bge-small-en-v1.5 --sparse_model bge-small-en-v1.5-quant" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The quantized BGE model is able to improve latency 
performance against the dense variant on a 10 core laptop by 3.3X! Furthermore, on optimized hardware, especially avx512 with VNNI instructions, up to 5X improvement can be observed." ] } ], @@ -2719,7 +2466,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": {