diff --git a/docs/source/tutorial/1_Embedding.rst b/docs/source/tutorial/1_Embedding.rst index f68ea30f..17002bef 100644 --- a/docs/source/tutorial/1_Embedding.rst +++ b/docs/source/tutorial/1_Embedding.rst @@ -9,4 +9,6 @@ 1_Embedding/1.1.1 1_Embedding/1.2.1 1_Embedding/1.2.2 - 1_Embedding/1.2.3 \ No newline at end of file + 1_Embedding/1.2.3 + 1_Embedding/1.2.4 + 1_Embedding/1.2.5 \ No newline at end of file diff --git a/docs/source/tutorial/1_Embedding/1.1.1.ipynb b/docs/source/tutorial/1_Embedding/1.1.1.ipynb index a3de317e..e1b61731 100644 --- a/docs/source/tutorial/1_Embedding/1.1.1.ipynb +++ b/docs/source/tutorial/1_Embedding/1.1.1.ipynb @@ -83,6 +83,18 @@ "%pip install -U FlagEmbedding sentence_transformers openai cohere" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'\n", + "# single GPU is better for small tasks\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -92,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -138,9 +150,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "initial target device: 100%|██████████| 8/8 [00:31<00:00, 3.89s/it]\n", + "Chunks: 100%|██████████| 3/3 [00:04<00:00, 1.61s/it]\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -148,9 +168,9 @@ "Embeddings:\n", "(3, 768)\n", "Similarity scores:\n", - "[[1. 0.7900386 0.57525384]\n", - " [0.7900386 0.9999998 0.59190154]\n", - " [0.57525384 0.59190154 0.99999994]]\n" + "[[1. 
0.79 0.575 ]\n", + " [0.79 0.9995 0.592 ]\n", + " [0.575 0.592 0.999 ]]\n" ] } ], @@ -373,7 +393,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "dev", "language": "python", "name": "python3" }, @@ -387,7 +407,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/docs/source/tutorial/1_Embedding/1.2.1.ipynb b/docs/source/tutorial/1_Embedding/1.2.1.ipynb index 39d5cf07..f56c4f50 100644 --- a/docs/source/tutorial/1_Embedding/1.2.1.ipynb +++ b/docs/source/tutorial/1_Embedding/1.2.1.ipynb @@ -53,6 +53,19 @@ "%pip install -U FlagEmbedding" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a2376217", + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'\n", + "# single GPU is better for small tasks\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" + ] + }, { "cell_type": "markdown", "id": "bc6e30a0", @@ -92,7 +105,7 @@ "source": [ "| Model | Language | Parameters | Model Size | Description | Base Model |\n", "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", - "| [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en) | English | 335M | 1.34 GB | Embedding Model which map text into vector | BERT |\n", + "| [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en) | English | 500M | 1.34 GB | Embedding Model which map text into vector | BERT |\n", "| [BAAI/bge-base-en](https://huggingface.co/BAAI/bge-base-en) | English | 109M | 438 MB | a base-scale model but with similar ability to `bge-large-en` | BERT |\n", "| [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) | English | 33.4M | 133 MB | a small-scale model but with competitive performance | BERT |\n", "| [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh) | Chinese | 326M | 1.3 GB | Embedding Model which map text into vector | BERT |\n", @@ -105,7 +118,7 @@ "id": "c9c45d17", "metadata": {}, "source": [ - "For inference, import FlagModel from FlagEmbedding and initialize the model." + "For inference, simply import FlagModel from FlagEmbedding and initialize the model." 
] }, { "cell_type": "code", "execution_count": null, "id": "89e07751", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.84864 0.7946737 ]\n", + " [0.760097 0.85449743]]\n" + ] + } + ], "source": [ "from FlagEmbedding import FlagModel\n", "\n", "# Load BGE model\n", - "model = FlagModel('BAAI/bge-base-en',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", + "model = FlagModel(\n", + " 'BAAI/bge-base-en',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " query_instruction_format='{}{}',\n", + ")\n", "\n", "queries = [\"query 1\", \"query 2\"]\n", "corpus = [\"passage 1\", \"passage 2\"]\n", "\n", "# encode the queries and corpus\n", - "q_embeddings = model.encode(queries)\n", - "p_embeddings = model.encode(corpus)\n", + "q_embeddings = model.encode_queries(queries)\n", + "p_embeddings = model.encode_corpus(corpus)\n", "\n", "# compute the similarity scores\n", "scores = q_embeddings @ p_embeddings.T\n", "print(scores)" ] }, @@ -139,15 +163,18 @@ "id": "6c8e69ed", "metadata": {}, "source": [ - "To use `FlagModel`:\n", - "```\n", + "For general encoding, use either `encode()`:\n", + "```python\n", "FlagModel.encode(sentences, batch_size=256, max_length=512, convert_to_numpy=True)\n", "```\n", - "The *encode()* function directly encode the input sentences to embedding vectors.\n", - "```\n", - "FlagModel.encode_queries(sentences, batch_size=256, max_length=512, convert_to_numpy=True)\n", + "or `encode_corpus()`, which directly calls `encode()`:\n", + "```python\n", + "FlagModel.encode_corpus(corpus, batch_size=256, max_length=512, convert_to_numpy=True)\n", "```\n", - "The *encode_queries()* function concatenate the `query_instruction_for_retrieval` with each of the input query, and then call `encode()`." + "The *encode_queries()* function concatenates the `query_instruction_for_retrieval` with each input query to form new sentences, and then feeds them to `encode()`:\n", + "```python\n", + "FlagModel.encode_queries(queries, batch_size=256, max_length=512, convert_to_numpy=True)\n", + "```" ] }, { @@ -186,140 +213,64 @@ "id": "ed00c504", "metadata": {}, "source": [ - "BGE 1.5 models shares the same API of `FlagModel` with BGE models." + "You can use BGE 1.5 models in exactly the same way as BGE v1 models.\n",
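+ "\n", + "Under the hood, `encode_queries()` simply prepends `query_instruction_for_retrieval` (combined via `query_instruction_format`) to each query before calling `encode()`. The sketch below, which assumes the model initialized above, illustrates that equivalence:\n", + "```python\n", + "instruction = \"Represent this sentence for searching relevant passages:\"\n", + "queries = [\"query 1\", \"query 2\"]\n", + "\n", + "# encode_queries() composes '{}{}'.format(instruction, query) internally,\n", + "# so these two calls should produce the same embeddings\n", + "q_emb_a = model.encode_queries(queries)\n", + "q_emb_b = model.encode([instruction + q for q in queries])\n", + "```"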
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "9b17afcc", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2252.58it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 3575.71it/s]" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "[[0.736794 0.5989914]\n", - " [0.5684842 0.7461165]]\n" + "[[0.76 0.6714]\n", + " [0.6177 0.7603]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" ] } ], "source": [ - "model = FlagModel('BAAI/bge-base-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", + "model = FlagModel(\n", + " 'BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " query_instruction_format='{}{}'\n", + ")\n", "\n", "queries = [\"query 1\", \"query 2\"]\n", "corpus = [\"passage 1\", \"passage 2\"]\n", "\n", "# encode the queries and corpus\n", - "q_embeddings = model.encode(queries)\n", - "p_embeddings = model.encode(corpus)\n", + "q_embeddings = model.encode_queries(queries)\n", + "p_embeddings = model.encode_corpus(corpus)\n", "\n", "# compute the similarity scores\n", "scores = q_embeddings @ p_embeddings.T\n", "print(scores)" ] }, - { - "cell_type": "markdown", - "id": "38c3ce1c", - "metadata": {}, - "source": [ - "### 2.3 LLM-Embedder" - ] - }, - { - "cell_type": "markdown", - "id": "1bc3fee0", - "metadata": {}, - "source": [ - "LLM-Embedder is a unified embedding model supporting diverse retrieval augmentation needs for LLMs. It is fine-tuned over 6 tasks:\n", - "- Question Answering (qa)\n", - "- Conversational Search (convsearch)\n", - "- Long Conversation (chat)\n", - "- Long-Rnage Language Modeling (lrlm)\n", - "- In-Context Learning (icl)\n", - "- Tool Learning (tool)" - ] - }, - { - "cell_type": "markdown", - "id": "13b926e9", - "metadata": {}, - "source": [ - "| Model | Language | Parameters | Model Size | Description | Base Model |\n", - "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", - "| [BAAI/llm-embedder](https://huggingface.co/BAAI/llm-embedder) | English | 109M | 438 MB | a unified embedding model to support diverse retrieval augmentation needs for LLMs | BERT |" - ] - }, - { - "cell_type": "markdown", - "id": "a7b3f109", - "metadata": {}, - "source": [ - "To use `LLMEmbedder`:\n", - "```python\n", - "LLMEmbedder.encode_queries(\n", - " queries, \n", - " batch_size=256, \n", - " max_length=256, \n", - " task='qa'\n", - ")\n", - "```\n", - "The *encode_queries()* will call the *_encode()* functions (similar to the *encode()* in `FlagModel`) and add the corresponding query instruction of the given *task* in front of each of the input *queries*.\n", - "```python\n", - "LLMEmbedder.encode_keys(\n", - " keys, \n", - " batch_size=256, \n", - " max_length=512, \n", - " task='qa'\n", - ")\n", - "```\n", - "Similarly, *encode_keys()* also calls *_encode()* and automatically add instructions according to given task." 
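+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the embeddings returned by `FlagModel` are L2-normalized by default, so the inner product above already equals cosine similarity. A small sanity check (a sketch assuming `q_embeddings` from the cell above and that numpy is installed):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# the L2 norm of each embedding should be (approximately) 1.0\n", + "print(np.linalg.norm(q_embeddings, axis=1))" + ] + },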
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5f077420", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.89705944 0.85341793]\n", - " [0.8462474 0.90914035]]\n" - ] - } - ], - "source": [ - "from FlagEmbedding import LLMEmbedder\n", - "\n", - "# load the LLMEmbedder model\n", - "model = LLMEmbedder('BAAI/llm-embedder', use_fp16=False)\n", - "\n", - "# Define queries and keys\n", - "queries = [\"test query 1\", \"test query 2\"]\n", - "keys = [\"test key 1\", \"test key 2\"]\n", - "\n", - "# Encode for a specific task (qa, icl, chat, lrlm, tool, convsearch)\n", - "task = \"qa\"\n", - "query_embeddings = model.encode_queries(queries, task=task)\n", - "key_embeddings = model.encode_keys(keys, task=task)\n", - "\n", - "# compute the similarity scores\n", - "similarity = query_embeddings @ key_embeddings.T\n", - "print(similarity)" - ] - }, { "cell_type": "markdown", "id": "dcf2a82b", "metadata": {}, "source": [ - "### 2.4 BGE M3" + "### 2.3 BGE M3" ] }, { @@ -347,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "d4647625", "metadata": {}, "outputs": [ @@ -355,7 +306,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 228780.22it/s]\n" + "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 194180.74it/s]\n" ] } ], @@ -385,19 +336,27 @@ "It returns a dictionary like:\n", "```python\n", "{\n", - " 'dense_vecs': 'array of dense embeddings of inputs if return_dense=True, otherwise None,'\n", - " 'lexical_weights': 'array of dictionaries with keys and values are ids of tokens and their corresponding weights if return_sparse=True, otherwise None,'\n", - " 'colbert_vecs': 'array of multi-vector embeddings of inputs if return_cobert_vecs=True, otherwise None,'\n", + " 'dense_vecs': # array of dense embeddings of inputs if return_dense=True, otherwise None,\n", + " 'lexical_weights': # array of dictionaries mapping token ids to their corresponding weights if return_sparse=True, otherwise None,\n", + " 'colbert_vecs': # array of multi-vector embeddings of inputs if return_colbert_vecs=True, otherwise None\n", "}\n", "```" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "f0b11cf0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 1148.18it/s]\n" + ] + } + ], "source": [ "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n", "embeddings = model.encode(\n", @@ -411,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "72cba126", "metadata": {}, "outputs": [ @@ -420,38 +379,36 @@ "output_type": "stream", "text": [ "dense embedding:\n", - "[[-0.03411707 -0.04707828 -0.00089447 ... 0.04828531 0.00755427\n", - " -0.02961654]\n", - " [-0.01041734 -0.04479263 -0.02429199 ... -0.00819298 0.01503995\n", - " 0.01113793]]\n", + "[[-0.03412 -0.04706 -0.00087 ... 0.04822 0.007614 -0.02957 ]\n", + " [-0.01035 -0.04483 -0.02434 ... 
-0.008224 0.01497 0.011055]]\n", "sparse embedding:\n", - "[defaultdict(, {'4865': 0.08362077, '83': 0.081469566, '335': 0.12964639, '11679': 0.25186998, '276': 0.17001738, '363': 0.26957875, '32': 0.040755156}), defaultdict(, {'262': 0.050144322, '5983': 0.13689369, '2320': 0.045134712, '111': 0.06342201, '90017': 0.25167602, '2588': 0.33353207})]\n", + "[defaultdict(, {'4865': np.float16(0.0836), '83': np.float16(0.0814), '335': np.float16(0.1296), '11679': np.float16(0.2517), '276': np.float16(0.1699), '363': np.float16(0.2695), '32': np.float16(0.04077)}), defaultdict(, {'262': np.float16(0.05014), '5983': np.float16(0.1367), '2320': np.float16(0.04517), '111': np.float16(0.0634), '90017': np.float16(0.2517), '2588': np.float16(0.3333)})]\n", "multi-vector:\n", - "[array([[-8.6726490e-03, -4.8921868e-02, -3.0449261e-03, ...,\n", - " -2.2082448e-02, 5.7268854e-02, 1.2811369e-02],\n", - " [-8.8765034e-03, -4.6860173e-02, -9.5845405e-03, ...,\n", - " -3.1404708e-02, 5.3911421e-02, 6.8714428e-03],\n", - " [ 1.8445771e-02, -4.2359587e-02, 8.6754939e-04, ...,\n", - " -1.9803897e-02, 3.8384371e-02, 7.6852231e-03],\n", + "[array([[-8.68966337e-03, -4.89266850e-02, -3.03634931e-03, ...,\n", + " -2.21243706e-02, 5.72856329e-02, 1.28355855e-02],\n", + " [-8.92937183e-03, -4.67235669e-02, -9.52814799e-03, ...,\n", + " -3.14785317e-02, 5.39088845e-02, 6.96671568e-03],\n", + " [ 1.84195358e-02, -4.22310382e-02, 8.55499704e-04, ...,\n", + " -1.97946690e-02, 3.84313315e-02, 7.71250250e-03],\n", " ...,\n", - " [-2.5543230e-02, -1.6561864e-02, -4.2125367e-02, ...,\n", - " -4.5030322e-02, 4.4091221e-02, -1.0043185e-02],\n", - " [ 4.9905590e-05, -5.5475257e-02, 8.4884483e-03, ...,\n", - " -2.2911752e-02, 6.0379632e-02, 9.3577225e-03],\n", - " [ 2.5895271e-03, -2.9331330e-02, -1.8961012e-02, ...,\n", - " -8.0389353e-03, 3.2842189e-02, 4.3894034e-02]], dtype=float32), array([[ 0.01715658, 0.03835309, -0.02311821, ..., 0.00146474,\n", - " 0.02993429, -0.05985384],\n", - " [ 0.00996143, 0.039217 , -0.03855301, ..., 0.00599566,\n", - " 0.02722942, -0.06509776],\n", - " [ 0.01777726, 0.03919311, -0.01709837, ..., 0.00805702,\n", - " 0.03988946, -0.05069073],\n", + " [-2.55824160e-02, -1.65533274e-02, -4.21357416e-02, ...,\n", + " -4.50234264e-02, 4.41286489e-02, -1.00052059e-02],\n", + " [ 5.90990965e-07, -5.53734899e-02, 8.51499755e-03, ...,\n", + " -2.29209941e-02, 6.04418293e-02, 9.39912070e-03],\n", + " [ 2.57394509e-03, -2.92690992e-02, -1.89342294e-02, ...,\n", + " -8.04431178e-03, 3.28964666e-02, 4.38723788e-02]], dtype=float32), array([[ 0.01724418, 0.03835401, -0.02309308, ..., 0.00141706,\n", + " 0.02995041, -0.05990082],\n", + " [ 0.00996325, 0.03922409, -0.03849588, ..., 0.00591671,\n", + " 0.02722516, -0.06510868],\n", + " [ 0.01781915, 0.03925728, -0.01710397, ..., 0.00801776,\n", + " 0.03987768, -0.05070014],\n", " ...,\n", - " [ 0.05474931, 0.0075684 , 0.00329455, ..., -0.01651684,\n", - " 0.02397249, 0.00368039],\n", - " [ 0.0093503 , 0.05022853, -0.02385841, ..., 0.02575599,\n", - " 0.00786822, -0.03260205],\n", - " [ 0.01805054, 0.01337725, 0.00016697, ..., 0.01843987,\n", - " 0.01374448, 0.00310114]], dtype=float32)]\n" + " [ 0.05478653, 0.00755799, 0.00328444, ..., -0.01648209,\n", + " 0.02405782, 0.00363262],\n", + " [ 0.00936953, 0.05028074, -0.02388872, ..., 0.02567679,\n", + " 0.00791224, -0.03257877],\n", + " [ 0.01803976, 0.0133922 , 0.00019365, ..., 0.0184015 ,\n", + " 0.01373822, 0.00315539]], dtype=float32)]\n" ] } ], @@ -460,11 +417,174 @@ "print(f\"sparse 
embedding:\n{embeddings['lexical_weights']}\")\n", "print(f\"multi-vector:\\n{embeddings['colbert_vecs']}\")" ] + }, + { + "cell_type": "markdown", + "id": "14d83caa", + "metadata": {}, + "source": [ + "### 2.4 BGE Multilingual Gemma2" + ] + }, + { + "cell_type": "markdown", + "id": "fd4c67df", + "metadata": {}, + "source": [ + "BGE Multilingual Gemma2 is an LLM-based multilingual embedding model." + ] + }, + { + "cell_type": "markdown", + "id": "abdca22e", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/bge-multilingual-gemma2](https://huggingface.co/BAAI/bge-multilingual-gemma2) | Multilingual | 9.24B | 37 GB | LLM-based multilingual embedding model with SOTA results on multilingual benchmarks | Gemma2-9B |" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8ec545bc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 6.34it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 816.49it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 718.33it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.559 0.01685 ]\n", + " [0.0008683 0.5015 ]]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagLLMModel\n", + "\n", + "queries = [\"how much protein should a female eat\", \"summit define\"]\n", + "documents = [\n", + " \"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.\",\n", + " \"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.\"\n", + "]\n", + "\n", + "model = FlagLLMModel('BAAI/bge-multilingual-gemma2', \n", + " query_instruction_for_retrieval=\"Given a web search query, retrieve relevant passages that answer the query.\",\n", + " use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation\n", + "\n", + "embeddings_1 = model.encode_queries(queries)\n", + "embeddings_2 = model.encode_corpus(documents)\n", + "similarity = embeddings_1 @ embeddings_2.T\n", + "print(similarity)\n" + ] + }, + { + "cell_type": "markdown", + "id": "8b7b2aa4", + "metadata": {}, + "source": [ + "### 2.5 BGE ICL" + ] + }, + { + "cell_type": "markdown", + "id": "7c9acb92", + "metadata": {}, + "source": [ + "BGE ICL stands for in-context learning. By providing few-shot examples in the query, it can significantly enhance the model's ability to handle new tasks." + ] + }, + { + "cell_type": "markdown", + "id": "cf6c9345", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/bge-en-icl](https://huggingface.co/BAAI/bge-en-icl) | English | 7.11B | 28.5 GB | LLM-based English embedding model with excellent in-context learning ability. 
| Mistral-7B |" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4595bae7", + "metadata": {}, + "outputs": [], + "source": [ + "documents = [\n", + " \"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.\",\n", + " \"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.\"\n", + "]\n", + "\n", + "examples = [\n", + " {\n", + " 'instruct': 'Given a web search query, retrieve relevant passages that answer the query.',\n", + " 'query': 'what is a virtual interface',\n", + " 'response': \"A virtual interface is a software-defined abstraction that mimics the behavior and characteristics of a physical network interface. It allows multiple logical network connections to share the same physical network interface, enabling efficient utilization of network resources. Virtual interfaces are commonly used in virtualization technologies such as virtual machines and containers to provide network connectivity without requiring dedicated hardware. They facilitate flexible network configurations and help in isolating network traffic for security and management purposes.\"\n", + " },\n", + " {\n", + " 'instruct': 'Given a web search query, retrieve relevant passages that answer the query.',\n", + " 'query': 'causes of back pain in female for a week',\n", + " 'response': \"Back pain in females lasting a week can stem from various factors. Common causes include muscle strain due to lifting heavy objects or improper posture, spinal issues like herniated discs or osteoporosis, menstrual cramps causing referred pain, urinary tract infections, or pelvic inflammatory disease. Pregnancy-related changes can also contribute. Stress and lack of physical activity may exacerbate symptoms. 
Proper diagnosis by a healthcare professional is crucial for effective treatment and management.\"\n", + " }\n", + "]\n", + "\n", + "queries = [\"how much protein should a female eat\", \"summit define\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffb586c6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 6.55it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 366.09it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 623.69it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.6064 0.3018]\n", + " [0.257 0.537 ]]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagICLModel\n", + "import os\n", + "\n", + "model = FlagICLModel('BAAI/bge-en-icl', \n", + " examples_for_task=examples, # set `examples_for_task=None` to use model without examples\n", + " # examples_instruction_format=\"{}\\n{}\\n{}\" # specify the format to use examples_for_task\n", + " )\n", + "\n", + "embeddings_1 = model.encode_queries(queries)\n", + "embeddings_2 = model.encode_corpus(documents)\n", + "similarity = embeddings_1 @ embeddings_2.T\n", + "\n", + "print(similarity)" + ] + } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "dev", "language": "python", "name": "python3" }, @@ -478,7 +598,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/docs/source/tutorial/1_Embedding/1.2.2.ipynb b/docs/source/tutorial/1_Embedding/1.2.2.ipynb index dbe94b89..3bcb4843 100644 --- a/docs/source/tutorial/1_Embedding/1.2.2.ipynb +++ b/docs/source/tutorial/1_Embedding/1.2.2.ipynb @@ -4,400 +4,258 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# BGE Explanation" + "# BGE Auto Embedder" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this section, we will go through BGE and BGE-v1.5's structure and how they generate embeddings." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the required packages in your environment." + "FlagEmbedding provides a high-level class `FlagAutoModel` that unifies the inference of embedding models. Besides the BGE series, it also supports other popular open-source embedding models such as E5, GTE, and SFR. In this tutorial, we will walk through how to use it." ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "%%capture\n", - "%pip install -U transformers FlagEmbedding" + "%pip install -U FlagEmbedding" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. Encode sentences" + "## 1. 
Usage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To know how exactly a sentence is encoded, let's first load the tokenizer and model from HF transformers instead of FlagEmbedding" + "First, import `FlagAutoModel` from FlagEmbedding, and use the `from_finetuned()` function to initialize the model:" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "from transformers import AutoTokenizer, AutoModel\n", - "import torch\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", - "model = AutoModel.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", + "from FlagEmbedding import FlagAutoModel\n", "\n", - "sentences = [\"embedding\", \"I love machine learning and nlp\"]" + "model = FlagAutoModel.from_finetuned(\n", + " 'BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages: \",\n", + " devices=\"cuda:0\", # if not specified, will use all available gpus or cpu when no gpu available\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n", - "\n", - "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure." + "Then use the model exactly same to `FlagModel` (`FlagM3Model` if using BGE M3, `FlagLLMModel` if using BGE Multilingual Gemma2, `FlagICLModel` if using BGE ICL)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 11, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "BertModel(\n", - " (embeddings): BertEmbeddings(\n", - " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", - " (position_embeddings): Embedding(512, 768)\n", - " (token_type_embeddings): Embedding(2, 768)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (encoder): BertEncoder(\n", - " (layer): ModuleList(\n", - " (0-11): 12 x BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " (intermediate_act_fn): GELUActivation()\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (pooler): BertPooler(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (activation): Tanh()\n", - " )\n", - ")" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "You're using a 
BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.76 0.6714]\n", + " [0.6177 0.7603]]\n" + ] } ], "source": [ - "model.eval()" + "queries = [\"query 1\", \"query 2\"]\n", + "corpus = [\"passage 1\", \"passage 2\"]\n", + "\n", + "# encode the queries and corpus\n", + "q_embeddings = model.encode_queries(queries)\n", + "p_embeddings = model.encode_corpus(corpus)\n", + "\n", + "# compute the similarity scores\n", + "scores = q_embeddings @ p_embeddings.T\n", + "print(scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's tokenize the sentences." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input_ids': tensor([[ 101, 7861, 8270, 4667, 102, 0, 0, 0, 0],\n", - " [ 101, 1045, 2293, 3698, 4083, 1998, 17953, 2361, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],\n", - " [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0],\n", - " [1, 1, 1, 1, 1, 1, 1, 1, 1]])}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = tokenizer(\n", - " sentences, \n", - " padding=True, \n", - " truncation=True, \n", - " return_tensors='pt', \n", - " max_length=512\n", - ")\n", - "inputs" + "## 2. Explanation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From the results, we can see that each sentence begins with token 101 and ends with 102, they are the `[CLS]` and `[SEP]` special token used in BERT." 
+ "`FlagAutoModel` uses an OrderedDict, `AUTO_EMBEDDER_MAPPING`, to store the configurations of all supported models:" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "torch.Size([2, 9, 768])" + "['bge-en-icl',\n", + " 'bge-multilingual-gemma2',\n", + " 'bge-m3',\n", + " 'bge-large-en-v1.5',\n", + " 'bge-base-en-v1.5',\n", + " 'bge-small-en-v1.5',\n", + " 'bge-large-zh-v1.5',\n", + " 'bge-base-zh-v1.5',\n", + " 'bge-small-zh-v1.5',\n", + " 'bge-large-en',\n", + " 'bge-base-en',\n", + " 'bge-small-en',\n", + " 'bge-large-zh',\n", + " 'bge-base-zh',\n", + " 'bge-small-zh',\n", + " 'e5-mistral-7b-instruct',\n", + " 'e5-large-v2',\n", + " 'e5-base-v2',\n", + " 'e5-small-v2',\n", + " 'multilingual-e5-large-instruct',\n", + " 'multilingual-e5-large',\n", + " 'multilingual-e5-base',\n", + " 'multilingual-e5-small',\n", + " 'e5-large',\n", + " 'e5-base',\n", + " 'e5-small',\n", + " 'gte-Qwen2-7B-instruct',\n", + " 'gte-Qwen2-1.5B-instruct',\n", + " 'gte-Qwen1.5-7B-instruct',\n", + " 'gte-multilingual-base',\n", + " 'gte-large-en-v1.5',\n", + " 'gte-base-en-v1.5',\n", + " 'gte-large',\n", + " 'gte-base',\n", + " 'gte-small',\n", + " 'gte-large-zh',\n", + " 'gte-base-zh',\n", + " 'gte-small-zh',\n", + " 'SFR-Embedding-2_R',\n", + " 'SFR-Embedding-Mistral',\n", + " 'Linq-Embed-Mistral']" ] }, - "execution_count": 22, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", - "last_hidden_state.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we implement the pooling function, with two choices of using `[CLS]`'s last hidden state, or the mean pooling of the whole last hidden state." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def pooling(last_hidden_state: torch.Tensor, pooling_method='cls', attention_mask: torch.Tensor = None):\n", - " if pooling_method == 'cls':\n", - " return last_hidden_state[:, 0]\n", - " elif pooling_method == 'mean':\n", - " s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)\n", - " d = attention_mask.sum(dim=1, keepdim=True).float()\n", - " return s / d" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Different from more commonly used mean pooling, BGE is trained to use the last hidden state of `[CLS]` as the sentence embedding: \n", + "from FlagEmbedding.inference.embedder.model_mapping import AUTO_EMBEDDER_MAPPING\n", "\n", - "`sentence_embeddings = model_output[0][:, 0]`\n", - "\n", - "If you use mean pooling, there will be a significant decrease in performance. Therefore, make sure to use the correct method to obtain sentence vectors." 
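+ "# list all model names supported by FlagAutoModel\n",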
+ "list(AUTO_EMBEDDER_MAPPING.keys())" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 13, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "torch.Size([2, 768])" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "EmbedderConfig(model_class=, pooling_method=, trust_remote_code=False, query_instruction_format='{}\\n{}')\n" + ] } ], "source": [ - "embeddings = pooling(\n", - " last_hidden_state, \n", - " pooling_method='cls', \n", - " attention_mask=inputs['attention_mask']\n", - ")\n", - "embeddings.shape" + "print(AUTO_EMBEDDER_MAPPING['bge-en-icl'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Assembling them together, we get the whole encoding function:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "def _encode(sentences, max_length=512, convert_to_numpy=True):\n", - "\n", - " # handle the case of single sentence and a list of sentences\n", - " input_was_string = False\n", - " if isinstance(sentences, str):\n", - " sentences = [sentences]\n", - " input_was_string = True\n", - "\n", - " inputs = tokenizer(\n", - " sentences, \n", - " padding=True, \n", - " truncation=True, \n", - " return_tensors='pt', \n", - " max_length=max_length\n", - " )\n", - "\n", - " last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", - " \n", - " embeddings = pooling(\n", - " last_hidden_state, \n", - " pooling_method='cls', \n", - " attention_mask=inputs['attention_mask']\n", - " )\n", - "\n", - " # normalize the embedding vectors\n", - " embeddings = torch.nn.functional.normalize(embeddings, dim=-1)\n", - "\n", - " # convert to numpy if needed\n", - " if convert_to_numpy:\n", - " embeddings = embeddings.detach().numpy()\n", - "\n", - " return embeddings[0] if input_was_string else embeddings" + "Take a look at the value of each key: it is an `EmbedderConfig` object, which consists of four attributes:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Comparison" + "```python\n", + "@dataclass\n", + "class EmbedderConfig:\n", + " model_class: Type[AbsEmbedder]\n", + " pooling_method: PoolingMethod\n", + " trust_remote_code: bool = False\n", + " query_instruction_format: str = \"{}{}\"\n", + "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's run the function we wrote to get the embeddings of the two sentences:" + "Not only the BGE series: it supports other models, such as E5, in the same way:" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Embeddings:\n", - "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", - " 2.8417887e-02 6.3214332e-02]\n", - " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 
3.5703944e-03\n", - " 1.8721525e-02 -2.0371782e-02]]\n", - "Similarity scores:\n", - "[[0.9999997 0.6077381]\n", - " [0.6077381 0.9999999]]\n" + "EmbedderConfig(model_class=, pooling_method=, trust_remote_code=False, query_instruction_format='{}\\n{}')\n" ] } ], "source": [ - "embeddings = _encode(sentences)\n", - "print(f\"Embeddings:\\n{embeddings}\")\n", - "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" + "print(AUTO_EMBEDDER_MAPPING['bge-en-icl'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Then, run the API provided in FlagEmbedding:" + "## 3. Customization" ] }, { - "cell_type": "code", - "execution_count": 27, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings:\n", - "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", - " 2.8417887e-02 6.3214332e-02]\n", - " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 3.5703944e-03\n", - " 1.8721525e-02 -2.0371782e-02]]\n", - "Similarity scores:\n", - "[[0.9999997 0.6077381]\n", - " [0.6077381 0.9999999]]\n" - ] - } - ], "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "model = FlagModel('BAAI/bge-base-en-v1.5')\n", + "If you want to use your own models through `FlagAutoModel`, consider the following steps:\n", "\n", - "embeddings = model.encode(sentences)\n", - "print(f\"Embeddings:\\n{embeddings}\")\n", + "1. Check the type of your embedding model and choose the appropriate model class: is it an encoder or a decoder?\n", + "2. What kind of pooling method does it use? CLS token, mean pooling, or last token?\n", + "3. Does your model need `trust_remote_code=True` to run?\n", + "4. Is there a query instruction format for retrieval?\n", "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details." + "Once these four attributes are determined, add your model name as the key and the corresponding `EmbedderConfig` as the value to `AUTO_EMBEDDER_MAPPING`. Now have a try!" ] } ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "dev", "language": "python", "name": "python3" }, @@ -411,7 +269,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/docs/source/tutorial/1_Embedding/1.2.3.ipynb b/docs/source/tutorial/1_Embedding/1.2.3.ipynb index b691f499..29182377 100644 --- a/docs/source/tutorial/1_Embedding/1.2.3.ipynb +++ b/docs/source/tutorial/1_Embedding/1.2.3.ipynb @@ -4,7 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# BGE-M3" + "# BGE Explanation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, we will go through BGE and BGE-v1.5's structure and how they generate embeddings." 
] }, { @@ -23,376 +30,374 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "%%capture\n", - "%pip install -U transformers FlagEmbedding accelerate" + "%pip install -U transformers FlagEmbedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Encode sentences" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. BGE-M3 structure" + "To know how exactly a sentence is encoded, let's first load the tokenizer and model from HF transformers instead of FlagEmbedding" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModel\n", - "import torch, os\n", + "import torch\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", + "model = AutoModel.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-m3\")\n", - "raw_model = AutoModel.from_pretrained(\"BAAI/bge-m3\")" + "sentences = [\"embedding\", \"I love machine learning and nlp\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The base model of BGE-M3 is [XLM-RoBERTa-large](https://huggingface.co/FacebookAI/xlm-roberta-large), which is a multilingual version of RoBERTa." + "Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as base model, with 12 encoder layers and hidden dimension of 768.\n", + "\n", + "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "XLMRobertaModel(\n", - " (embeddings): XLMRobertaEmbeddings(\n", - " (word_embeddings): Embedding(250002, 1024, padding_idx=1)\n", - " (position_embeddings): Embedding(8194, 1024, padding_idx=1)\n", - " (token_type_embeddings): Embedding(1, 1024)\n", - " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + "BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", - " (encoder): XLMRobertaEncoder(\n", + " (encoder): BertEncoder(\n", " (layer): ModuleList(\n", - " (0-23): 24 x XLMRobertaLayer(\n", - " (attention): XLMRobertaAttention(\n", - " (self): XLMRobertaSelfAttention(\n", - " (query): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (key): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (value): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (0-11): 12 x BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", - " (output): XLMRobertaSelfOutput(\n", - " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (output): BertSelfOutput(\n", + " (dense): 
Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", - " (intermediate): XLMRobertaIntermediate(\n", - " (dense): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", - " (output): XLMRobertaOutput(\n", - " (dense): Linear(in_features=4096, out_features=1024, bias=True)\n", - " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", - " (pooler): XLMRobertaPooler(\n", - " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (pooler): BertPooler(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (activation): Tanh()\n", " )\n", ")" ] }, - "execution_count": 4, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "raw_model.eval()" + "model.eval()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Multi-Functionality" + "First, let's tokenize the sentences." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 240131.91it/s]\n" - ] + "data": { + "text/plain": [ + "{'input_ids': tensor([[ 101, 7861, 8270, 4667, 102, 0, 0, 0, 0],\n", + " [ 101, 1045, 2293, 3698, 4083, 1998, 17953, 2361, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0],\n", + " [1, 1, 1, 1, 1, 1, 1, 1, 1]])}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from FlagEmbedding import BGEM3FlagModel\n", - "\n", - "model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)\n", - "\n", - "sentences_1 = [\"What is BGE M3?\", \"Defination of BM25\"]\n", - "sentences_2 = [\"BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.\", \n", - " \"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1 Dense Retrieval" + "inputs = tokenizer(\n", + " sentences, \n", + " padding=True, \n", + " truncation=True, \n", + " return_tensors='pt', \n", + " max_length=512\n", + ")\n", + "inputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Using BGE M3 for dense embedding has similar steps to BGE or BGE 1.5 models.\n", - "\n", - "Use the normalized hidden state of the special token [CLS] as the embedding:\n", - "\n", - "$$e_q = norm(H_q[0])$$\n", - "\n", - "Then compute the relevance score between the query and passage:\n", - "\n", - "$$s_{dense}=f_{sim}(e_p, e_q)$$\n", - "\n", - "where $e_p, e_q$ are the embedding vectors of passage and query, respectively.\n", - "\n", - "$f_{sim}$ is the score function (such as inner product and L2 distance) for comupting two embeddings' similarity." 
+ "From the results, we can see that each sentence begins with token 101 and ends with 102, which are the `[CLS]` and `[SEP]` special tokens used in BERT." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.6259035 0.34749585]\n", - " [0.349868 0.6782462 ]]\n" - ] + "data": { + "text/plain": [ + "torch.Size([2, 9, 768])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n", - "embeddings_1 = model.encode(sentences_1, max_length=10)['dense_vecs']\n", - "embeddings_2 = model.encode(sentences_2, max_length=100)['dense_vecs']\n", - "\n", - "# compute the similarity scores\n", - "s_dense = embeddings_1 @ embeddings_2.T\n", - "print(s_dense)" + "last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", + "last_hidden_state.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.2 Sparse Retrieval" + "Here we implement the pooling function, with two choices of using `[CLS]`'s last hidden state, or the mean pooling of the whole last hidden state." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def pooling(last_hidden_state: torch.Tensor, pooling_method='cls', attention_mask: torch.Tensor = None):\n", + " if pooling_method == 'cls':\n", + " return last_hidden_state[:, 0]\n", + " elif pooling_method == 'mean':\n", + " s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)\n", + " d = attention_mask.sum(dim=1, keepdim=True).float()\n", + " return s / d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Set `return_sparse` to true to make the model return sparse vector. If a term token appears multiple times in the sentence, we only retain its max weight.\n", - "\n", - "BGE-M3 generates sparce embeddings by adding a linear layer and a ReLU activation function following the hidden states:\n", + "Different from more commonly used mean pooling, BGE is trained to use the last hidden state of `[CLS]` as the sentence embedding: \n", "\n", - "$$w_{qt} = \\text{Relu}(W_{lex}^T H_q [i])$$\n", + "`sentence_embeddings = model_output[0][:, 0]`\n", "\n", - "where $W_{lex}$ representes the weights of linear layer and $H_q[i]$ is the encoder's output of the $i^{th}$ token." + "If you use mean pooling, there will be a significant decrease in performance. Therefore, make sure to use the correct method to obtain sentence vectors.\n",
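+ "\n", + "For a quick comparison, here is a sketch (reusing the `pooling()` helper and the `inputs` defined above) that contrasts the two strategies:\n", + "```python\n", + "cls_emb = pooling(last_hidden_state, 'cls', inputs['attention_mask'])\n", + "mean_emb = pooling(last_hidden_state, 'mean', inputs['attention_mask'])\n", + "\n", + "# the two strategies yield noticeably different sentence vectors\n", + "print(torch.nn.functional.cosine_similarity(cls_emb, mean_emb))\n", + "```"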
] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 24, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'What': 0.08362077, 'is': 0.081469566, 'B': 0.12964639, 'GE': 0.25186998, 'M': 0.17001738, '3': 0.26957875, '?': 0.040755156}, {'De': 0.050144322, 'fin': 0.13689369, 'ation': 0.045134712, 'of': 0.06342201, 'BM': 0.25167602, '25': 0.33353207}]\n" - ] + "data": { + "text/plain": [ + "torch.Size([2, 768])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "output_1 = model.encode(sentences_1, return_sparse=True)\n", - "output_2 = model.encode(sentences_2, return_sparse=True)\n", - "\n", - "# you can see the weight for each token:\n", - "print(model.convert_id_to_token(output_1['lexical_weights']))" + "embeddings = pooling(\n", + " last_hidden_state, \n", + " pooling_method='cls', \n", + " attention_mask=inputs['attention_mask']\n", + ")\n", + "embeddings.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Based on the tokens' weights of query and passage, the relevance score between them is computed by the joint importance of the co-existed terms within the query and passage:\n", - "\n", - "$$s_{lex} = \\sum_{t\\in q\\cap p}(w_{qt} * w_{pt})$$\n", - "\n", - "where $w_{qt}, w_{pt}$ are the importance weights of each co-existed term $t$ in query and passage, respectively." + "Assembling them together, we get the whole encoding function:" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 25, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.19554448500275612\n", - "0.00880391988903284\n" - ] - } - ], + "outputs": [], "source": [ - "# compute the scores via lexical mathcing\n", - "s_lex_10_20 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])\n", - "s_lex_10_21 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][1])\n", + "def _encode(sentences, max_length=512, convert_to_numpy=True):\n", + "\n", + " # handle the case of single sentence and a list of sentences\n", + " input_was_string = False\n", + " if isinstance(sentences, str):\n", + " sentences = [sentences]\n", + " input_was_string = True\n", + "\n", + " inputs = tokenizer(\n", + " sentences, \n", + " padding=True, \n", + " truncation=True, \n", + " return_tensors='pt', \n", + " max_length=max_length\n", + " )\n", + "\n", + " last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", + " \n", + " embeddings = pooling(\n", + " last_hidden_state, \n", + " pooling_method='cls', \n", + " attention_mask=inputs['attention_mask']\n", + " )\n", "\n", - "print(s_lex_10_20)\n", - "print(s_lex_10_21)" + " # normalize the embedding vectors\n", + " embeddings = torch.nn.functional.normalize(embeddings, dim=-1)\n", + "\n", + " # convert to numpy if needed\n", + " if convert_to_numpy:\n", + " embeddings = embeddings.detach().numpy()\n", + "\n", + " return embeddings[0] if input_was_string else embeddings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.3 Multi-Vector" + "## 2. 
Comparison" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The multi-vector method utilizes the entire output embeddings for the representation of query $E_q$ and passage $E_p$.\n", - "\n", - "$$E_q = norm(W_{mul}^T H_q)$$\n", - "$$E_p = norm(W_{mul}^T H_p)$$\n", - "\n", - "where $W_{mul}$ is the learnable projection matrix." + "Now let's run the function we wrote to get the embeddings of the two sentences:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(8, 1024)\n", - "(30, 1024)\n" + "Embeddings:\n", + "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", + " 2.8417887e-02 6.3214332e-02]\n", + " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 3.5703944e-03\n", + " 1.8721525e-02 -2.0371782e-02]]\n", + "Similarity scores:\n", + "[[0.9999997 0.6077381]\n", + " [0.6077381 0.9999999]]\n" ] } ], "source": [ - "output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n", - "output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n", + "embeddings = _encode(sentences)\n", + "print(f\"Embeddings:\\n{embeddings}\")\n", "\n", - "print(f\"({len(output_1['colbert_vecs'][0])}, {len(output_1['colbert_vecs'][0][0])})\")\n", - "print(f\"({len(output_2['colbert_vecs'][0])}, {len(output_2['colbert_vecs'][0][0])})\")" + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Following ColBert, we use late-interaction to compute the fine-grained relevance score:\n", - "\n", - "$$s_{mul}=\\frac{1}{N}\\sum_{i=1}^N\\max_{j=1}^M E_q[i]\\cdot E_p^T[j]$$\n", - "\n", - "where $E_q, E_p$ are the entire output embeddings of query and passage, respectively.\n", - "\n", - "This is a summation of average of maximum similarity of each $v\\in E_q$ with vectors in $E_p$" + "Then, run the API provided in FlagEmbedding:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.7796662449836731\n", - "0.4621177911758423\n" + "Embeddings:\n", + "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", + " 2.8417887e-02 6.3214332e-02]\n", + " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 3.5703944e-03\n", + " 1.8721525e-02 -2.0371782e-02]]\n", + "Similarity scores:\n", + "[[0.9999997 0.6077381]\n", + " [0.6077381 0.9999999]]\n" ] } ], "source": [ - "s_mul_10_20 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]).item()\n", - "s_mul_10_21 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1]).item()\n", + "from FlagEmbedding import FlagModel\n", "\n", - "print(s_mul_10_20)\n", - "print(s_mul_10_21)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.4 Hybrid Ranking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "BGE-M3's multi-functionality gives the possibility of hybrid ranking to improve retrieval. Firstly, due to the heavy cost of multi-vector method, we can retrieve the candidate results by either of the dense or sparse method. 
Then, to get the final result, we can rerank the candidates based on the integrated relevance score:\n", + "model = FlagModel('BAAI/bge-base-en-v1.5')\n", "\n", - "$$s_{rank} = w_1\\cdot s_{dense}+w_2\\cdot s_{lex} + w_3\\cdot s_{mul}$$\n", + "embeddings = model.encode(sentences)\n", + "print(f\"Embeddings:\\n{embeddings}\")\n", "\n", - "where the values chosen for $w_1, w_2$ and $w_3$ varies depending on the downstream scenario (here 1/3 is just for demonstration)." + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" ] }, { - "cell_type": "code", - "execution_count": 12, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.5337047390639782\n", - "0.27280585498859483\n" - ] - } - ], "source": [ - "s_rank_10_20 = 1/3 * s_dense[0][0] + 1/3 * s_lex_10_20 + 1/3 * s_mul_10_20\n", - "s_rank_10_21 = 1/3 * s_dense[0][1] + 1/3 * s_lex_10_21 + 1/3 * s_mul_10_21\n", - "\n", - "print(s_rank_10_20)\n", - "print(s_rank_10_21)" + "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details." ] } ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "dev", "language": "python", "name": "python3" }, @@ -406,7 +411,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.13.0" } }, "nbformat": 4, diff --git a/docs/source/tutorial/1_Embedding/1.2.4.ipynb b/docs/source/tutorial/1_Embedding/1.2.4.ipynb new file mode 100644 index 00000000..b691f499 --- /dev/null +++ b/docs/source/tutorial/1_Embedding/1.2.4.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BGE-M3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the required packages in your environment." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install -U transformers FlagEmbedding accelerate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. BGE-M3 structure" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModel\n", + "import torch, os\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-m3\")\n", + "raw_model = AutoModel.from_pretrained(\"BAAI/bge-m3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The base model of BGE-M3 is [XLM-RoBERTa-large](https://huggingface.co/FacebookAI/xlm-roberta-large), which is a multilingual version of RoBERTa." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "XLMRobertaModel(\n",
+ "  (embeddings): XLMRobertaEmbeddings(\n",
+ "    (word_embeddings): Embedding(250002, 1024, padding_idx=1)\n",
+ "    (position_embeddings): Embedding(8194, 1024, padding_idx=1)\n",
+ "    (token_type_embeddings): Embedding(1, 1024)\n",
+ "    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+ "    (dropout): Dropout(p=0.1, inplace=False)\n",
+ "  )\n",
+ "  (encoder): XLMRobertaEncoder(\n",
+ "    (layer): ModuleList(\n",
+ "      (0-23): 24 x XLMRobertaLayer(\n",
+ "        (attention): XLMRobertaAttention(\n",
+ "          (self): XLMRobertaSelfAttention(\n",
+ "            (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
+ "            (key): Linear(in_features=1024, out_features=1024, bias=True)\n",
+ "            (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
+ "            (dropout): Dropout(p=0.1, inplace=False)\n",
+ "          )\n",
+ "          (output): XLMRobertaSelfOutput(\n",
+ "            (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
+ "            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+ "            (dropout): Dropout(p=0.1, inplace=False)\n",
+ "          )\n",
+ "        )\n",
+ "        (intermediate): XLMRobertaIntermediate(\n",
+ "          (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
+ "          (intermediate_act_fn): GELUActivation()\n",
+ "        )\n",
+ "        (output): XLMRobertaOutput(\n",
+ "          (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
+ "          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+ "          (dropout): Dropout(p=0.1, inplace=False)\n",
+ "        )\n",
+ "      )\n",
+ "    )\n",
+ "  )\n",
+ "  (pooler): XLMRobertaPooler(\n",
+ "    (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
+ "    (activation): Tanh()\n",
+ "  )\n",
+ ")"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "raw_model.eval()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Multi-Functionality"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 240131.91it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from FlagEmbedding import BGEM3FlagModel\n",
+ "\n",
+ "model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)\n",
+ "\n",
+ "sentences_1 = [\"What is BGE M3?\", \"Defination of BM25\"]\n",
+ "sentences_2 = [\"BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.\", \n",
+ "               \"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.1 Dense Retrieval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Using BGE-M3 for dense embedding follows similar steps to the BGE and BGE 1.5 models.\n",
+ "\n",
+ "Use the normalized hidden state of the special token [CLS] as the embedding:\n",
+ "\n",
+ "$$e_q = norm(H_q[0])$$\n",
+ "\n",
+ "Then compute the relevance score between the query and passage:\n",
+ "\n",
+ "$$s_{dense}=f_{sim}(e_p, e_q)$$\n",
+ "\n",
+ "where $e_p, e_q$ are the embedding vectors of passage and query, respectively.\n",
+ "\n",
+ "$f_{sim}$ is the score function (such as inner product or L2 distance) for computing two embeddings' similarity."
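+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Under the hood, the dense vector is just the normalized [CLS] hidden state. The following minimal sketch uses the raw backbone loaded above to mimic $e_q = norm(H_q[0])$; it only approximates `model.encode(...)['dense_vecs']`, since the released model runs with fp16 and its own preprocessing:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch.nn.functional as F\n",
+ "\n",
+ "# CLS pooling with the raw XLM-RoBERTa backbone\n",
+ "with torch.no_grad():\n",
+ "    batch = tokenizer(sentences_1, padding=True, truncation=True, return_tensors='pt')\n",
+ "    hidden = raw_model(**batch).last_hidden_state  # (batch_size, seq_len, 1024)\n",
+ "    e_q = F.normalize(hidden[:, 0], dim=-1)        # normalized [CLS] hidden state\n",
+ "print(e_q.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now compute the dense embeddings and scores with `model.encode`:"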
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[0.6259035  0.34749585]\n",
+ " [0.349868   0.6782462 ]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n",
+ "embeddings_1 = model.encode(sentences_1, max_length=10)['dense_vecs']\n",
+ "embeddings_2 = model.encode(sentences_2, max_length=100)['dense_vecs']\n",
+ "\n",
+ "# compute the similarity scores\n",
+ "s_dense = embeddings_1 @ embeddings_2.T\n",
+ "print(s_dense)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.2 Sparse Retrieval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Set `return_sparse` to true to make the model return sparse vectors. If a term token appears multiple times in the sentence, we only retain its max weight.\n",
+ "\n",
+ "BGE-M3 generates sparse embeddings by adding a linear layer and a ReLU activation function following the hidden states:\n",
+ "\n",
+ "$$w_{qt} = \\text{ReLU}(W_{lex}^T H_q [i])$$\n",
+ "\n",
+ "where $W_{lex}$ represents the weights of the linear layer and $H_q[i]$ is the encoder's output of the $i^{th}$ token."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[{'What': 0.08362077, 'is': 0.081469566, 'B': 0.12964639, 'GE': 0.25186998, 'M': 0.17001738, '3': 0.26957875, '?': 0.040755156}, {'De': 0.050144322, 'fin': 0.13689369, 'ation': 0.045134712, 'of': 0.06342201, 'BM': 0.25167602, '25': 0.33353207}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "output_1 = model.encode(sentences_1, return_sparse=True)\n",
+ "output_2 = model.encode(sentences_2, return_sparse=True)\n",
+ "\n",
+ "# you can see the weight for each token:\n",
+ "print(model.convert_id_to_token(output_1['lexical_weights']))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Based on the token weights of the query and passage, the relevance score between them is computed by the joint importance of the co-occurring terms within the query and passage:\n",
+ "\n",
+ "$$s_{lex} = \\sum_{t\\in q\\cap p}(w_{qt} * w_{pt})$$\n",
+ "\n",
+ "where $w_{qt}, w_{pt}$ are the importance weights of each co-occurring term $t$ in query and passage, respectively."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.19554448500275612\n",
+ "0.00880391988903284\n"
+ ]
+ }
+ ],
+ "source": [
+ "# compute the scores via lexical matching\n",
+ "s_lex_10_20 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])\n",
+ "s_lex_10_21 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][1])\n",
+ "\n",
+ "print(s_lex_10_20)\n",
+ "print(s_lex_10_21)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.3 Multi-Vector"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The multi-vector method utilizes the entire output embeddings for the representation of query $E_q$ and passage $E_p$.\n",
+ "\n",
+ "$$E_q = norm(W_{mul}^T H_q)$$\n",
+ "$$E_p = norm(W_{mul}^T H_p)$$\n",
+ "\n",
+ "where $W_{mul}$ is the learnable projection matrix."
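+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before moving on, a quick check on the sparse scores: $s_{lex}$ really is just a sum of products over shared tokens. Here is a minimal hand-rolled version, assuming each entry of `lexical_weights` behaves like a `{token_id: weight}` mapping as returned above; it should reproduce `compute_lexical_matching_score` up to floating-point noise:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# s_lex = sum over shared tokens t of w_qt * w_pt\n",
+ "def lexical_score(w_q, w_p):\n",
+ "    return sum(w * w_p[t] for t, w in w_q.items() if t in w_p)\n",
+ "\n",
+ "print(lexical_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Returning to the multi-vector method, inspect the token-level embeddings:"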
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(8, 1024)\n",
+ "(30, 1024)\n"
+ ]
+ }
+ ],
+ "source": [
+ "output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n",
+ "output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n",
+ "\n",
+ "print(f\"({len(output_1['colbert_vecs'][0])}, {len(output_1['colbert_vecs'][0][0])})\")\n",
+ "print(f\"({len(output_2['colbert_vecs'][0])}, {len(output_2['colbert_vecs'][0][0])})\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Following ColBERT, we use late interaction to compute the fine-grained relevance score:\n",
+ "\n",
+ "$$s_{mul}=\\frac{1}{N}\\sum_{i=1}^N\\max_{j=1}^M E_q[i]\\cdot E_p^T[j]$$\n",
+ "\n",
+ "where $E_q, E_p$ are the entire output embeddings of query and passage, respectively.\n",
+ "\n",
+ "That is, for each vector $v\\in E_q$, we take its maximum similarity with the vectors in $E_p$, then average over all query token vectors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.7796662449836731\n",
+ "0.4621177911758423\n"
+ ]
+ }
+ ],
+ "source": [
+ "s_mul_10_20 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]).item()\n",
+ "s_mul_10_21 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1]).item()\n",
+ "\n",
+ "print(s_mul_10_20)\n",
+ "print(s_mul_10_21)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.4 Hybrid Ranking"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "BGE-M3's multi-functionality makes hybrid ranking possible for improved retrieval. Because the multi-vector method is costly, we can first retrieve candidate results with either the dense or the sparse method. Then, to get the final result, we rerank the candidates based on an integrated relevance score:\n",
+ "\n",
+ "$$s_{rank} = w_1\\cdot s_{dense}+w_2\\cdot s_{lex} + w_3\\cdot s_{mul}$$\n",
+ "\n",
+ "where the values chosen for $w_1, w_2$ and $w_3$ vary depending on the downstream scenario (here 1/3 is just for demonstration)."
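+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick check on $s_{mul}$, here is a minimal NumPy version of the late-interaction score. It assumes each entry of `colbert_vecs` is an L2-normalized `(num_tokens, dim)` array, and should closely match `model.colbert_score`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# s_mul: max similarity over passage tokens, then mean over query tokens\n",
+ "def late_interaction(E_q, E_p):\n",
+ "    sim = np.asarray(E_q) @ np.asarray(E_p).T  # (N, M) token-level similarities\n",
+ "    return sim.max(axis=1).mean()\n",
+ "\n",
+ "print(late_interaction(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now combine the three scores into the weighted hybrid score:"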
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.5337047390639782\n",
+ "0.27280585498859483\n"
+ ]
+ }
+ ],
+ "source": [
+ "s_rank_10_20 = 1/3 * s_dense[0][0] + 1/3 * s_lex_10_20 + 1/3 * s_mul_10_20\n",
+ "s_rank_10_21 = 1/3 * s_dense[0][1] + 1/3 * s_lex_10_21 + 1/3 * s_mul_10_21\n",
+ "\n",
+ "print(s_rank_10_20)\n",
+ "print(s_rank_10_21)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/source/tutorial/1_Embedding/1.2.5.ipynb b/docs/source/tutorial/1_Embedding/1.2.5.ipynb
new file mode 100644
index 00000000..b67d0318
--- /dev/null
+++ b/docs/source/tutorial/1_Embedding/1.2.5.ipynb
@@ -0,0 +1,346 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# BGE-EN-ICL"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this tutorial, we will go through BGE-EN-ICL, an LLM-based embedding model with strong zero-shot and few-shot embedding capabilities."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 0. Installation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install the required packages in your environment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install -U transformers FlagEmbedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. BGE-EN-ICL structure"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ "  from .autonotebook import tqdm as notebook_tqdm\n",
+ "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  9.94it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModel\n",
+ "import torch, os\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-en-icl\")\n",
+ "raw_model = AutoModel.from_pretrained(\"BAAI/bge-en-icl\")\n",
+ "\n",
+ "sentences = [\"embedding\", \"I love machine learning and nlp\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Different from the previous BGE embedding models, which are encoder-only models, BGE-EN-ICL uses a decoder-only LLM, Mistral-7B, as the base model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "MistralModel(\n",
+ "  (embed_tokens): Embedding(32003, 4096)\n",
+ "  (layers): ModuleList(\n",
+ "    (0-31): 32 x MistralDecoderLayer(\n",
+ "      (self_attn): MistralSdpaAttention(\n",
+ "        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ "        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
+ "        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
+ "        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
+ "        (rotary_emb): MistralRotaryEmbedding()\n",
+ "      )\n",
+ "      (mlp): MistralMLP(\n",
+ "        (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
+ "        (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
+ "        (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
+ "        (act_fn): SiLU()\n",
+ "      )\n",
+ "      (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)\n",
+ "      (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)\n",
+ "    )\n",
+ "  )\n",
+ "  (norm): MistralRMSNorm((4096,), eps=1e-05)\n",
+ ")"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "raw_model.eval()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. New Pooling Method"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "BERT-like encoder-only networks are considered to have strong capacity for representation learning because of their bidirectional attention structure. Some previous work replaces unidirectional attention with bidirectional attention during the embedding training phase. But this can create a mismatch with the model's pre-training design, which could potentially undermine its in-context learning and generative properties.\n",
+ "\n",
+ "Thus BGE-EN-ICL uses the [EOS] token's output embedding to address this issue."
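+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The pooling trick below depends on the tokenizer's special tokens and padding side, which we can quickly inspect:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# check the [EOS] token and the padding side used by the tokenizer\n",
+ "print(tokenizer.eos_token, tokenizer.eos_token_id)\n",
+ "print(tokenizer.padding_side)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Tokenize the two example sentences:"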
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     1, 28643,     2],\n",
+ "        [    1,   315,  2016,  5599,  5168,   304,   307, 12312,     2]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1],\n",
+ "        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs = tokenizer(\n",
+ "    sentences,\n",
+ "    padding=True,\n",
+ "    return_tensors='pt',\n",
+ ")\n",
+ "inputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([2, 9, 4096])"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "last_hidden_state = raw_model(**inputs, return_dict=True).last_hidden_state\n",
+ "last_hidden_state.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The last token/[EOS] pooling method can be described as follows.\n",
+ "\n",
+ "The tokenized input sequence $T: [\\text{BOS}], t_1, ..., t_N$ is fed into the LLM:\n",
+ "$$h_t = \\text{LLM}(T)[\\text{EOS}]$$\n",
+ "where $h_t$ is the text embedding taken from the output embedding of the special token [EOS]."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def last_token_pool(last_hidden_states: torch.Tensor,\n",
+ "                    attention_mask: torch.Tensor) -> torch.Tensor:\n",
+ "    \n",
+ "    # if every sequence is left-padded, the last position always holds the [EOS] token\n",
+ "    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])\n",
+ "    if left_padding:\n",
+ "        return last_hidden_states[:, -1]\n",
+ "    else:\n",
+ "        # otherwise, locate the last non-padding token of each sequence\n",
+ "        sequence_lengths = attention_mask.sum(dim=1) - 1\n",
+ "        batch_size = last_hidden_states.shape[0]\n",
+ "        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([2, 4096])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "embeddings = last_token_pool(\n",
+ "    last_hidden_state, \n",
+ "    attention_mask=inputs['attention_mask']\n",
+ ")\n",
+ "embeddings.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. In-Context Learning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "BGE-EN-ICL integrates the strong in-context learning ability of LLMs into an embedding model while still preserving strong zero-shot embedding capability."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For zero-shot inference, it's exactly the same as BGE v1 & v1.5. For few-shot inference, provide examples in the following way:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "examples = [\n",
+ "    {\n",
+ "        'instruct': 'Given a web search query, retrieve relevant passages that answer the query.',\n",
+ "        'query': 'what is a virtual interface',\n",
+ "        'response': \"A virtual interface is a software-defined abstraction that mimics the behavior and characteristics of a physical network interface. It allows multiple logical network connections to share the same physical network interface, enabling efficient utilization of network resources. 
Virtual interfaces are commonly used in virtualization technologies such as virtual machines and containers to provide network connectivity without requiring dedicated hardware. They facilitate flexible network configurations and help in isolating network traffic for security and management purposes.\"\n", + " },\n", + " {\n", + " 'instruct': 'Given a web search query, retrieve relevant passages that answer the query.',\n", + " 'query': 'causes of back pain in female for a week',\n", + " 'response': \"Back pain in females lasting a week can stem from various factors. Common causes include muscle strain due to lifting heavy objects or improper posture, spinal issues like herniated discs or osteoporosis, menstrual cramps causing referred pain, urinary tract infections, or pelvic inflammatory disease. Pregnancy-related changes can also contribute. Stress and lack of physical activity may exacerbate symptoms. Proper diagnosis by a healthcare professional is crucial for effective treatment and management.\"\n", + " }\n", + "]\n", + "\n", + "queries = [\"how much protein should a female eat\", \"summit define\"]\n", + "documents = [\n", + " \"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.\",\n", + " \"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 4.59it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 501.41it/s]\n", + "You're using a LlamaTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[0.6064 0.302 ]\n",
+ " [0.257  0.5366]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from FlagEmbedding import FlagICLModel\n",
+ "\n",
+ "model = FlagICLModel('BAAI/bge-en-icl', \n",
+ "                     examples_for_task=examples,  # set `examples_for_task=None` to use model without examples\n",
+ "                     examples_instruction_format=\"{}\\n{}\\n{}\", # specify the format to use examples_for_task\n",
+ "                     devices=[0],\n",
+ "                    )\n",
+ "\n",
+ "embeddings_1 = model.encode_queries(queries)\n",
+ "embeddings_2 = model.encode_corpus(documents)\n",
+ "similarity = embeddings_1 @ embeddings_2.T\n",
+ "\n",
+ "print(similarity)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ft",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/source/tutorial/4_Evaluation.rst b/docs/source/tutorial/4_Evaluation.rst
index 403b804d..5c457a99 100644
--- a/docs/source/tutorial/4_Evaluation.rst
+++ b/docs/source/tutorial/4_Evaluation.rst
@@ -9,4 +9,8 @@
    4_Evaluation/4.1.1
    4_Evaluation/4.2.1
    4_Evaluation/4.2.2
+   4_Evaluation/4.2.3
    4_Evaluation/4.3.1
+   4_Evaluation/4.4.1
+   4_Evaluation/4.5.1
+   4_Evaluation/4.5.2
diff --git a/docs/source/tutorial/4_Evaluation/4.2.3.ipynb b/docs/source/tutorial/4_Evaluation/4.2.3.ipynb
new file mode 100644
index 00000000..5832680f
--- /dev/null
+++ b/docs/source/tutorial/4_Evaluation/4.2.3.ipynb
@@ -0,0 +1,240 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# C-MTEB"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "C-MTEB is the largest benchmark for Chinese text embeddings, similar to MTEB. In this tutorial, we will go through how to evaluate an embedding model's ability on Chinese tasks in C-MTEB."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 0. Installation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First, install the dependent packages:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install FlagEmbedding mteb"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "C-MTEB uses task splits and metrics similar to the English MTEB. It contains 35 datasets in 6 different tasks: Classification, Clustering, Pair Classification, Reranking, Retrieval, and Semantic Textual Similarity (STS). \n",
+ "\n",
+ "1. **Classification**: The embeddings are used to train a logistic regression classifier on the train set, which is then scored on the test set. F1 is the main metric.\n",
+ "2. **Clustering**: Train a mini-batch k-means model with batch size 32 and k equal to the number of different labels. Then score using v-measure.\n",
+ "3. **Pair Classification**: A pair of text inputs is provided and a binary label needs to be assigned. The main metric is the average precision score.\n",
+ "4. 
**Reranking**: Rank a list of relevant and irrelevant reference texts according to a query. Metrics are mean MRR@k and MAP.\n",
+ "5. **Retrieval**: Each dataset comprises corpus, queries, and a mapping that links each query to its relevant documents within the corpus. The goal is to retrieve relevant documents for each query. The main metric is nDCG@k. MTEB directly adopts BEIR for the retrieval task.\n",
+ "6. **Semantic Textual Similarity (STS)**: Determine the similarity between each sentence pair. Spearman correlation based on cosine similarity serves as the main metric.\n",
+ "\n",
+ "\n",
+ "Check the [HF page](https://huggingface.co/C-MTEB) for the details of each dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ChineseTaskList = [\n",
+ "    'TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'OnlineShopping', 'Waimai',\n",
+ "    'CLSClusteringS2S.v2', 'CLSClusteringP2P.v2', 'ThuNewsClusteringS2S.v2', 'ThuNewsClusteringP2P.v2',\n",
+ "    'Ocnli', 'Cmnli',\n",
+ "    'T2Reranking', 'MMarcoReranking', 'CMedQAv1-reranking', 'CMedQAv2-reranking',\n",
+ "    'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',\n",
+ "    'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC'\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First, load the model for evaluation. Note that the instruction here is used for retrieval tasks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from C_MTEB.flag_dres_model import FlagDRESModel\n",
+ "\n",
+ "instruction = \"为这个句子生成表示以用于检索相关文章:\"\n",
+ "model_name = \"BAAI/bge-base-zh-v1.5\"\n",
+ "\n",
+ "model = FlagDRESModel(model_name_or_path=model_name,\n",
+ "                      query_instruction_for_retrieval=instruction,\n",
+ "                      pooling_method=\"cls\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Alternatively, you can load a model using sentence_transformers:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sentence_transformers import SentenceTransformer\n",
+ "\n",
+ "model = SentenceTransformer(\"PATH_TO_MODEL\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Or implement a class following the structure below:\n",
+ "\n",
+ "```python\n",
+ "class MyModel():\n",
+ "    def __init__(self):\n",
+ "        \"\"\"initialize the tokenizer and model\"\"\"\n",
+ "        pass\n",
+ "\n",
+ "    def encode(self, sentences, batch_size=32, **kwargs):\n",
+ "        \"\"\" Returns a list of embeddings for the given sentences.\n",
+ "        Args:\n",
+ "            sentences (`List[str]`): List of sentences to encode\n",
+ "            batch_size (`int`): Batch size for the encoding\n",
+ "\n",
+ "        Returns:\n",
+ "            `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences\n",
+ "        \"\"\"\n",
+ "        pass\n",
+ "\n",
+ "model = MyModel()\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Evaluate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After we've prepared the dataset and model, we can start the evaluation. For time efficiency, we highly recommend using a GPU for evaluation."
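+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you are on a multi-GPU machine but only need a single GPU, you can optionally pin the process to one device before running (an optional convenience, following the same pattern used in the earlier tutorials):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# expose only the first GPU to this process (optional)\n",
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Then run the evaluation over all the C-MTEB tasks:"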
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mteb\n",
+ "from mteb import MTEB\n",
+ "\n",
+ "tasks = mteb.get_tasks(ChineseTaskList)\n",
+ "\n",
+ "for task in tasks:\n",
+ "    evaluation = MTEB(tasks=[task])\n",
+ "    evaluation.run(model, output_folder=f\"zh_results/{model_name.split('/')[-1]}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Submit to MTEB Leaderboard"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After the evaluation is done, all the evaluation results should be stored in `zh_results/{model_name}/`.\n",
+ "\n",
+ "Then run the following shell command to create model_card.md, changing `{model_name}` and the path to match yours."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mteb create_meta --results_folder zh_results/{model_name}/ --output_path model_card.md"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Copy and paste the contents of model_card.md to the top of README.md of your model on HF Hub. Then go to the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) and choose the Chinese leaderboard to find your model! It will appear soon after the website's daily refresh."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/source/tutorial/4_Evaluation/4.3.1.ipynb b/docs/source/tutorial/4_Evaluation/4.3.1.ipynb
index 5832680f..62bb7f30 100644
--- a/docs/source/tutorial/4_Evaluation/4.3.1.ipynb
+++ b/docs/source/tutorial/4_Evaluation/4.3.1.ipynb
@@ -4,14 +4,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "# C-MTEB"
+ "# Evaluation Using Sentence Transformers"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "C-MTEB is the largest benchmark for Chinese text embeddings, similar to MTEB. In this tutorial, we will go through how to evaluate an embedding model's ability on Chinese tasks in C-MTEB."
+ "In this tutorial, we will go through how to use the Sentence Transformers library to do evaluation."
 ]
 },
 {
@@ -21,45 +21,13 @@
 "## 0. Installation"
 ]
 },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "First install dependent packages:"
- ]
- },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
- "%pip install FlagEmbedding mteb"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Datasets"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "C-MTEB uses similar task splits and metrics as English MTEB. It contains 35 datasets in 6 different tasks: Classification, Clustering, Pair Classification, Reranking, Retrieval, and Semantic Textual Similarity (STS). \n",
- "\n",
- "1. **Classification**: Use the embeddings to train a logistic regression on the train set and is scored on the test set. F1 is the main metric.\n",
- "2. **Clustering**: Train a mini-batch k-means model with batch size 32 and k equals to the number of different labels. Then score using v-measure.\n",
- "3. 
**Pair Classification**: A pair of text inputs is provided and a label which is a binary variable needs to be assigned. The main metric is average precision score.\n",
- "4. **Reranking**: Rank a list of relevant and irrelevant reference texts according to a query. Metrics are mean MRR@k and MAP.\n",
- "5. **Retrieval**: Each dataset comprises corpus, queries, and a mapping that links each query to its relevant documents within the corpus. The goal is to retrieve relevant documents for each query. The main metric is nDCG@k. MTEB directly adopts BEIR for the retrieval task.\n",
- "6. **Semantic Textual Similarity (STS)**: Determine the similarity between each sentence pair. Spearman correlation based on cosine\n",
- "similarity serves as the main metric.\n",
- "\n",
- "\n",
- "Check the [HF page](https://huggingface.co/C-MTEB) for the details of each dataset."
+ "%pip install -U sentence-transformers"
 ]
 },
 {
@@ -68,28 +36,24 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "ChineseTaskList = [\n",
- "    'TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'OnlineShopping', 'Waimai',\n",
- "    'CLSClusteringS2S.v2', 'CLSClusteringP2P.v2', 'ThuNewsClusteringS2S.v2', 'ThuNewsClusteringP2P.v2',\n",
- "    'Ocnli', 'Cmnli',\n",
- "    'T2Reranking', 'MMarcoReranking', 'CMedQAv1-reranking', 'CMedQAv2-reranking',\n",
- "    'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',\n",
- "    'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC'\n",
- "]"
+ "from sentence_transformers import SentenceTransformer\n",
+ "\n",
+ "# Load a model\n",
+ "model = SentenceTransformer('all-MiniLM-L6-v2')"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "## 2. Model"
+ "## 1. Retrieval"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "First, load the model for evaluation. Note that the instruction here is used for retreival tasks."
+ "Let's choose retrieval as the first task."
 ]
 },
 {
@@ -98,21 +62,18 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "from ...C_MTEB.flag_dres_model import FlagDRESModel\n",
+ "import random\n",
 "\n",
- "instruction = \"为这个句子生成表示以用于检索相关文章:\"\n",
- "model_name = \"BAAI/bge-base-zh-v1.5\"\n",
+ "from sentence_transformers.evaluation import InformationRetrievalEvaluator\n",
 "\n",
- "model = FlagDRESModel(model_name_or_path=\"BAAI/bge-base-zh-v1.5\",\n",
- "                      query_instruction_for_retrieval=instruction,\n",
- "                      pooling_method=\"cls\")"
+ "from datasets import load_dataset"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Otherwise, you can load a model using sentence_transformers:"
+ "BeIR is a well-known benchmark for retrieval. Let's use the Quora dataset for our evaluation."
]
},
{
@@ -121,50 +82,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "from sentence_transformers import SentenceTransformer\n",
- "\n",
- "model = SentenceTransformer(\"PATH_TO_MODEL\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Or implement a class following the structure below:\n",
- "\n",
- "```python\n",
- "class MyModel():\n",
- "    def __init__(self):\n",
- "        \"\"\"initialize the tokenizer and model\"\"\"\n",
- "        pass\n",
- "\n",
- "    def encode(self, sentences, batch_size=32, **kwargs):\n",
- "        \"\"\" Returns a list of embeddings for the given sentences.\n",
- "        Args:\n",
- "            sentences (`List[str]`): List of sentences to encode\n",
- "            batch_size (`int`): Batch size for the encoding\n",
- "\n",
- "        Returns:\n",
- "            `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences\n",
- "        \"\"\"\n",
- "        pass\n",
- "\n",
- "model = MyModel()\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Evaluate"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "After we've prepared the dataset and model, we can start the evaluation. For time efficiency, we highly recommend to use GPU for evaluation."
+ "# Load the Quora IR dataset (https://huggingface.co/datasets/BeIR/quora, https://huggingface.co/datasets/BeIR/quora-qrels)\n",
+ "corpus = load_dataset(\"BeIR/quora\", \"corpus\", split=\"corpus\")\n",
+ "queries = load_dataset(\"BeIR/quora\", \"queries\", split=\"queries\")\n",
+ "relevant_docs_data = load_dataset(\"BeIR/quora-qrels\", split=\"validation\")"
 ]
 },
 {
@@ -173,30 +94,28 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "import mteb\n",
- "from mteb import MTEB\n",
+ "# Shrink the corpus size heavily to only the relevant documents + 10,000 random documents\n",
+ "required_corpus_ids = list(map(str, relevant_docs_data[\"corpus-id\"]))\n",
+ "required_corpus_ids += random.sample(corpus[\"_id\"], k=10_000)\n",
+ "corpus = corpus.filter(lambda x: x[\"_id\"] in required_corpus_ids)\n",
 "\n",
- "tasks = mteb.get_tasks(ChineseTaskList)\n",
- "\n",
- "for task in tasks:\n",
- "    evaluation = MTEB(tasks=[task])\n",
- "    evaluation.run(model, output_folder=f\"zh_results/{model_name.split('/')[-1]}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 4. Submit to MTEB Leaderboard"
+ "# Convert the datasets to dictionaries\n",
+ "corpus = dict(zip(corpus[\"_id\"], corpus[\"text\"]))  # Our corpus (cid => document)\n",
+ "queries = dict(zip(queries[\"_id\"], queries[\"text\"]))  # Our queries (qid => question)\n",
+ "relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids]))\n",
+ "for qid, corpus_ids in zip(relevant_docs_data[\"query-id\"], relevant_docs_data[\"corpus-id\"]):\n",
+ "    qid = str(qid)\n",
+ "    corpus_ids = str(corpus_ids)\n",
+ "    if qid not in relevant_docs:\n",
+ "        relevant_docs[qid] = set()\n",
+ "    relevant_docs[qid].add(corpus_ids)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "After the evaluation is done, all the evaluation results should be stored in `zh_results/{model_name}/`.\n",
- "\n",
- "Then run the following shell command to create the model_card.md. Change {model_name} and its following to your path."
+ "Finally, we are ready to run the evaluation."
]
},
{
@@ -205,14 +124,15 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "!!mteb create_meta --results_folder results/{model_name}/ --output_path model_card.md"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Copy and paste the contents of model_card.md to the top of README.md of your model on HF Hub. Then goto the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) and choose the Chinese leaderboard to find your model! It will appear soon after the website's daily refresh."
+ "# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.\n",
+ "ir_evaluator = InformationRetrievalEvaluator(\n",
+ "    queries=queries,\n",
+ "    corpus=corpus,\n",
+ "    relevant_docs=relevant_docs,\n",
+ "    name=\"BeIR-quora-dev\",\n",
+ ")\n",
+ "\n",
+ "results = ir_evaluator(model)"
 ]
 }
 ],
@@ -223,15 +143,7 @@
 "name": "python3"
 },
 "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
 "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
 "version": "3.12.2"
 }
 },
diff --git a/docs/source/tutorial/4_Evaluation/4.4.1.ipynb b/docs/source/tutorial/4_Evaluation/4.4.1.ipynb
new file mode 100644
index 00000000..faac629c
--- /dev/null
+++ b/docs/source/tutorial/4_Evaluation/4.4.1.ipynb
@@ -0,0 +1,467 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluate on BEIR"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[BEIR](https://github.com/beir-cellar/beir) (Benchmarking-IR) is a heterogeneous evaluation benchmark for information retrieval. \n",
+ "It is designed for evaluating the performance of NLP-based retrieval models and is widely used in research on modern embedding models."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 0. Installation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First install the libraries we are using:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install beir FlagEmbedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Evaluate using BEIR"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "BEIR contains 18 datasets which can be downloaded from the [link](https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/), while 4 of them are private datasets that need appropriate licences. If you want access to those 4 datasets, take a look at their [wiki](https://github.com/beir-cellar/beir/wiki/Datasets-available) for more information. Information collected and code adapted from the BEIR GitHub [repo](https://github.com/beir-cellar/beir)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "| Dataset Name | Type |  Queries  | Documents | Avg. 
Docs/Q | Public | \n", + "| ---------| :-----------: | ---------| --------- | ------| :------------:| \n", + "| ``msmarco`` | `Train` `Dev` `Test` | 6,980 | 8.84M | 1.1 | Yes | \n", + "| ``trec-covid``| `Test` | 50| 171K| 493.5 | Yes | \n", + "| ``nfcorpus`` | `Train` `Dev` `Test` | 323 | 3.6K | 38.2 | Yes |\n", + "| ``bioasq``| `Train` `Test` | 500 | 14.91M | 8.05 | No | \n", + "| ``nq``| `Train` `Test` | 3,452 | 2.68M | 1.2 | Yes | \n", + "| ``hotpotqa``| `Train` `Dev` `Test` | 7,405 | 5.23M | 2.0 | Yes |\n", + "| ``fiqa`` | `Train` `Dev` `Test` | 648 | 57K | 2.6 | Yes | \n", + "| ``signal1m`` | `Test` | 97 | 2.86M | 19.6 | No |\n", + "| ``trec-news`` | `Test` | 57 | 595K | 19.6 | No |\n", + "| ``arguana`` | `Test` | 1,406 | 8.67K | 1.0 | Yes |\n", + "| ``webis-touche2020``| `Test` | 49 | 382K | 49.2 | Yes |\n", + "| ``cqadupstack``| `Test` | 13,145 | 457K | 1.4 | Yes |\n", + "| ``quora``| `Dev` `Test` | 10,000 | 523K | 1.6 | Yes | \n", + "| ``dbpedia-entity``| `Dev` `Test` | 400 | 4.63M | 38.2 | Yes | \n", + "| ``scidocs``| `Test` | 1,000 | 25K | 4.9 | Yes | \n", + "| ``fever``| `Train` `Dev` `Test` | 6,666 | 5.42M | 1.2| Yes | \n", + "| ``climate-fever``| `Test` | 1,535 | 5.42M | 3.0 | Yes |\n", + "| ``scifact``| `Train` `Test` | 300 | 5K | 1.1 | Yes |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 Load Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First prepare the logging setup." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "from beir import LoggingHandler\n", + "\n", + "logging.basicConfig(format='%(message)s',\n", + " level=logging.INFO,\n", + " handlers=[LoggingHandler()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this demo, we choose the `arguana` dataset for a quick demonstration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset downloaded here: /share/project/xzy/Projects/FlagEmbedding/Tutorials/4_Evaluation/data/arguana\n" + ] + } + ], + "source": [ + "import os\n", + "from beir import util\n", + "\n", + "url = \"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/arguana.zip\"\n", + "out_dir = os.path.join(os.getcwd(), \"data\")\n", + "data_path = util.download_and_unzip(url, out_dir)\n", + "print(f\"Dataset is stored at: {data_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-11-15 03:54:55,809 - Loading Corpus...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 8674/8674 [00:00<00:00, 158928.31it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-11-15 03:54:55,891 - Loaded 8674 TEST Documents.\n", + "2024-11-15 03:54:55,891 - Doc Example: {'text': \"You don’t have to be vegetarian to be green. Many special environments have been created by livestock farming – for example chalk down land in England and mountain pastures in many countries. Ending livestock farming would see these areas go back to woodland with a loss of many unique plants and animals. Growing crops can also be very bad for the planet, with fertilisers and pesticides polluting rivers, lakes and seas. 
Most tropical forests are now cut down for timber, or to allow oil palm trees to be grown in plantations, not to create space for meat production. British farmer and former editor Simon Farrell also states: “Many vegans and vegetarians rely on one source from the U.N. calculation that livestock generates 18% of global carbon emissions, but this figure contains basic mistakes. It attributes all deforestation from ranching to cattle, rather than logging or development. It also muddles up one-off emissions from deforestation with on-going pollution.” He also refutes the statement of meat production inefficiency: “Scientists have calculated that globally the ratio between the amounts of useful plant food used to produce meat is about 5 to 1. If you feed animals only food that humans can eat — which is, indeed, largely the case in the Western world — that may be true. But animals also eat food we can't eat, such as grass. So the real conversion figure is 1.4 to 1.” [1] At the same time eating a vegetarian diet may be no more environmentally friendly than a meat based diet if it is not sustainably sourced or uses perishable fruit and vegetables that are flown in from around the world. Eating locally sourced food can has as big an impact as being vegetarian. [2] [1] Tara Kelly, Simon Fairlie: How Eating Meat Can Save the World, 12 October 2010 [2] Lucy Siegle, ‘It is time to become a vegetarian?’ The Observer, 18th May 2008\", 'title': 'animals environment general health health general weight philosophy ethics'}\n", + "2024-11-15 03:54:55,891 - Loading Queries...\n", + "2024-11-15 03:54:55,903 - Loaded 1406 TEST Queries.\n", + "2024-11-15 03:54:55,903 - Query Example: Being vegetarian helps the environment Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: “it takes four football fields of land (about 1.6 hectares) to feed each Canadian” and “one apple tree produces enough fruit to make 320 pies.” Think about it — a couple of apple trees and a few rows of wheat on a mere fraction of a hectare could produce enough food for one person! [1] The 2006 U.N. Food and Agriculture Organization (FAO) report concluded that worldwide livestock farming generates 18% of the planet's greenhouse gas emissions — by comparison, all the world's cars, trains, planes and boats account for a combined 13% of greenhouse gas emissions. [2] As a result of the above point producing meat damages the environment. The demand for meat drives deforestation. 
Daniel Cesar Avelino of Brazil's Federal Public Prosecution Office says “We know that the single biggest driver of deforestation in the Amazon is cattle.” This clearing of tropical rainforests such as the Amazon for agriculture is estimated to produce 17% of the world's greenhouse gas emissions. [3] Not only this but the production of meat takes a lot more energy than it ultimately gives us chicken meat production consumes energy in a 4:1 ratio to protein output; beef cattle production requires an energy input to protein output ratio of 54:1. The same is true with water use due to the same phenomenon of meat being inefficient to produce in terms of the amount of grain needed to produce the same weight of meat, production requires a lot of water. Water is another scarce resource that we will soon not have enough of in various areas of the globe. Grain-fed beef production takes 100,000 liters of water for every kilogram of food. Raising broiler chickens takes 3,500 liters of water to make a kilogram of meat. In comparison, soybean production uses 2,000 liters for kilogram of food produced; rice, 1,912; wheat, 900; and potatoes, 500 liters. [4] This is while there are areas of the globe that have severe water shortages. With farming using up to 70 times more water than is used for domestic purposes: cooking and washing. A third of the population of the world is already suffering from a shortage of water. [5] Groundwater levels are falling all over the world and rivers are beginning to dry up. Already some of the biggest rivers such as China’s Yellow river do not reach the sea. [6] With a rising population becoming vegetarian is the only responsible way to eat. [1] Stephen Leckie, ‘How Meat-centred Eating Patterns Affect Food Security and the Environment’, International development research center [2] Bryan Walsh, Meat: Making Global Warming Worse, Time magazine, 10 September 2008 . [3] David Adam, Supermarket suppliers ‘helping to destroy Amazon rainforest’, The Guardian, 21st June 2009. [4] Roger Segelken, U.S. could feed 800 million people with grain that livestock eat, Cornell Science News, 7th August 1997. [5] Fiona Harvey, Water scarcity affects one in three, FT.com, 21st August 2003 [6] Rupert Wingfield-Hayes, Yellow river ‘drying up’, BBC News, 29th July 2004\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from beir.datasets.data_loader import GenericDataLoader\n", + "\n", + "corpus, queries, qrels = GenericDataLoader(\"data/arguana\").load(split=\"test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we load `bge-base-en-v1.5` from huggingface and evaluate its performance on arguana." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-11-15 04:00:45,253 - Use pytorch device_name: cuda\n", + "2024-11-15 04:00:45,254 - Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5\n", + "2024-11-15 04:00:48,750 - Encoding Queries...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 11/11 [00:01<00:00, 8.27it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-11-15 04:00:50,177 - Sorting Corpus by document length (Longest first)...\n", + "2024-11-15 04:00:50,183 - Encoding Corpus in batches... 
Warning: This might take a while!\n",
+ "2024-11-15 04:00:50,183 - Scoring Function: Cosine Similarity (cos_sim)\n",
+ "2024-11-15 04:00:50,184 - Encoding Batch 1/1...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Batches: 100%|██████████| 68/68 [00:07<00:00,  9.43it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from beir.retrieval.evaluation import EvaluateRetrieval\n",
+ "from beir.retrieval import models\n",
+ "from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES\n",
+ "\n",
+ "\n",
+ "# Load bge model using Sentence Transformers\n",
+ "model = DRES(models.SentenceBERT(\"BAAI/bge-base-en-v1.5\"), batch_size=128)\n",
+ "retriever = EvaluateRetrieval(model, score_function=\"cos_sim\")\n",
+ "\n",
+ "# Get the searching results\n",
+ "results = retriever.retrieve(corpus, queries)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2024-11-15 04:00:58,514 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]\n",
+ "2024-11-15 04:00:58,514 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.\n",
+ "2024-11-15 04:00:59,184 - \n",
+ "\n",
+ "2024-11-15 04:00:59,188 - NDCG@1: 0.4075\n",
+ "2024-11-15 04:00:59,188 - NDCG@3: 0.5572\n",
+ "2024-11-15 04:00:59,188 - NDCG@5: 0.5946\n",
+ "2024-11-15 04:00:59,188 - NDCG@10: 0.6361\n",
+ "2024-11-15 04:00:59,188 - NDCG@100: 0.6606\n",
+ "2024-11-15 04:00:59,188 - NDCG@1000: 0.6613\n",
+ "2024-11-15 04:00:59,188 - \n",
+ "\n",
+ "2024-11-15 04:00:59,188 - MAP@1: 0.4075\n",
+ "2024-11-15 04:00:59,188 - MAP@3: 0.5193\n",
+ "2024-11-15 04:00:59,188 - MAP@5: 0.5402\n",
+ "2024-11-15 04:00:59,188 - MAP@10: 0.5577\n",
+ "2024-11-15 04:00:59,188 - MAP@100: 0.5634\n",
+ "2024-11-15 04:00:59,188 - MAP@1000: 0.5635\n",
+ "2024-11-15 04:00:59,188 - \n",
+ "\n",
+ "2024-11-15 04:00:59,188 - Recall@1: 0.4075\n",
+ "2024-11-15 04:00:59,188 - Recall@3: 0.6671\n",
+ "2024-11-15 04:00:59,188 - Recall@5: 0.7575\n",
+ "2024-11-15 04:00:59,188 - Recall@10: 0.8841\n",
+ "2024-11-15 04:00:59,188 - Recall@100: 0.9915\n",
+ "2024-11-15 04:00:59,189 - Recall@1000: 0.9964\n",
+ "2024-11-15 04:00:59,189 - \n",
+ "\n",
+ "2024-11-15 04:00:59,189 - P@1: 0.4075\n",
+ "2024-11-15 04:00:59,189 - P@3: 0.2224\n",
+ "2024-11-15 04:00:59,189 - P@5: 0.1515\n",
+ "2024-11-15 04:00:59,189 - P@10: 0.0884\n",
+ "2024-11-15 04:00:59,189 - P@100: 0.0099\n",
+ "2024-11-15 04:00:59,189 - P@1000: 0.0010\n"
+ ]
+ }
+ ],
+ "source": [
+ "logging.info(\"Retriever evaluation for k in: {}\".format(retriever.k_values))\n",
+ "ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Evaluate using FlagEmbedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We provide independent evaluation for popular datasets and benchmarks. Try the following code to run the evaluation, or run the shell script provided in the [example](../../examples/evaluation/beir/eval_beir.sh) folder."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the arguments:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "arguments = \"\"\"-\n", + " --eval_name beir \n", + " --dataset_dir ./beir/data \n", + " --dataset_names arguana\n", + " --splits test dev \n", + " --corpus_embd_save_dir ./beir/corpus_embd \n", + " --output_dir ./beir/search_results \n", + " --search_top_k 1000 \n", + " --rerank_top_k 100 \n", + " --cache_path /root/.cache/huggingface/hub \n", + " --overwrite True \n", + " --k_values 10 100 \n", + " --eval_output_method markdown \n", + " --eval_output_path ./beir/beir_eval_results.md \n", + " --eval_metrics ndcg_at_10 recall_at_100 \n", + " --ignore_identical_ids True \n", + " --embedder_name_or_path BAAI/bge-base-en-v1.5 \n", + " --embedder_batch_size 1024\n", + " --devices cuda:4\n", + "\"\"\".replace('\\n','')\n", + "\n", + "sys.argv = arguments.split()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then pass the arguments to HFArgumentParser and run the evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Split 'dev' not found in the dataset. Removing it from the list.\n", + "ignore_identical_ids is set to True. This means that the search results will not contain identical ids. Note: Dataset such as MIRACL should NOT set this to True.\n", + "pre tokenize: 100%|██████████| 9/9 [00:00<00:00, 16.19it/s]\n", + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "Inference Embeddings: 100%|██████████| 9/9 [00:11<00:00, 1.27s/it]\n", + "pre tokenize: 100%|██████████| 2/2 [00:00<00:00, 19.54it/s]\n", + "Inference Embeddings: 100%|██████████| 2/2 [00:02<00:00, 1.29s/it]\n", + "Searching: 100%|██████████| 44/44 [00:00<00:00, 208.73it/s]\n" + ] + } + ], + "source": [ + "from transformers import HfArgumentParser\n", + "\n", + "from FlagEmbedding.evaluation.beir import (\n", + " BEIREvalArgs, BEIREvalModelArgs,\n", + " BEIREvalRunner\n", + ")\n", + "\n", + "\n", + "parser = HfArgumentParser((\n", + " BEIREvalArgs,\n", + " BEIREvalModelArgs\n", + "))\n", + "\n", + "eval_args, model_args = parser.parse_args_into_dataclasses()\n", + "eval_args: BEIREvalArgs\n", + "model_args: BEIREvalModelArgs\n", + "\n", + "runner = BEIREvalRunner(\n", + " eval_args=eval_args,\n", + " model_args=model_args\n", + ")\n", + "\n", + "runner.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Take a look at the results and choose the way you prefer!" 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ "    \"arguana-test\": {\n",
+ "        \"ndcg_at_10\": 0.63668,\n",
+ "        \"ndcg_at_100\": 0.66075,\n",
+ "        \"map_at_10\": 0.55801,\n",
+ "        \"map_at_100\": 0.56358,\n",
+ "        \"recall_at_10\": 0.88549,\n",
+ "        \"recall_at_100\": 0.99147,\n",
+ "        \"precision_at_10\": 0.08855,\n",
+ "        \"precision_at_100\": 0.00991,\n",
+ "        \"mrr_at_10\": 0.55809,\n",
+ "        \"mrr_at_100\": 0.56366\n",
+ "    }\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open('beir/search_results/bge-base-en-v1.5/NoReranker/EVAL/eval_results.json', 'r') as content_file:\n",
+ "    print(content_file.read())"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "dev",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/source/tutorial/4_Evaluation/4.5.1.ipynb b/docs/source/tutorial/4_Evaluation/4.5.1.ipynb
new file mode 100644
index 00000000..58dfdc08
--- /dev/null
+++ b/docs/source/tutorial/4_Evaluation/4.5.1.ipynb
@@ -0,0 +1,738 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluate on MIRACL"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[MIRACL](https://project-miracl.github.io/) (Multilingual Information Retrieval Across a Continuum of Languages) is a WSDM 2023 Cup challenge that focuses on search across 18 different languages. It releases a multilingual retrieval dataset containing train and dev sets for 16 “known languages” and only a dev set for 2 “surprise languages”. The topics are generated by native speakers of each language, who also label the relevance between the topics and a given document list. You can find the dataset on Hugging Face."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note: We highly recommend running the MIRACL evaluation on GPU. For reference, the whole process takes about an hour on an 8xA100 40G node."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 0. Installation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First install the libraries we are using:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install FlagEmbedding pytrec_eval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With its great number of passages and articles across the 18 languages, MIRACL is a rich resource for training or evaluating multilingual models. The data can be downloaded from [Hugging Face](https://huggingface.co/datasets/miracl/miracl)."
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "| Language        | # of Passages | # of Articles |\n",
+    "|:----------------|--------------:|--------------:|\n",
+    "| Arabic (ar)     |     2,061,414 |       656,982 |\n",
+    "| Bengali (bn)    |       297,265 |        63,762 |\n",
+    "| English (en)    |    32,893,221 |     5,758,285 |\n",
+    "| Spanish (es)    |    10,373,953 |     1,669,181 |\n",
+    "| Persian (fa)    |     2,207,172 |       857,827 |\n",
+    "| Finnish (fi)    |     1,883,509 |       447,815 |\n",
+    "| French (fr)     |    14,636,953 |     2,325,608 |\n",
+    "| Hindi (hi)      |       506,264 |       148,107 |\n",
+    "| Indonesian (id) |     1,446,315 |       446,330 |\n",
+    "| Japanese (ja)   |     6,953,614 |     1,133,444 |\n",
+    "| Korean (ko)     |     1,486,752 |       437,373 |\n",
+    "| Russian (ru)    |     9,543,918 |     1,476,045 |\n",
+    "| Swahili (sw)    |       131,924 |        47,793 |\n",
+    "| Telugu (te)     |       518,079 |        66,353 |\n",
+    "| Thai (th)       |       542,166 |       128,179 |\n",
+    "| Chinese (zh)    |     4,934,368 |     1,246,389 |"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "lang = \"en\"\n",
+    "corpus = load_dataset(\"miracl/miracl-corpus\", lang, trust_remote_code=True)['train']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each passage in the corpus has three fields: `docid`, `title`, and `text`. In a docid of the form `x#y`, `x` is the id of the Wikipedia article the passage comes from, and `y` is the position of the passage within that article. The title is the name of the article with id `x`, and the text is the body of the passage."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'docid': '56672809#4',\n",
+       " 'title': 'Glen Tomasetti',\n",
+       " 'text': 'In 1967 Tomasetti was prosecuted after refusing to pay one sixth of her taxes on the grounds that one sixth of the federal budget was funding Australia\\'s military presence in Vietnam. In court she argued that Australia\\'s participation in the Vietnam War violated its international legal obligations as a member of the United Nations. Public figures such as Joan Baez had made similar protests in the USA, but Tomasetti\\'s prosecution was \"believed to be the first case of its kind in Australia\", according to a contemporary news report. Tomasetti was eventually ordered to pay the unpaid taxes.'}"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "corpus[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The dev set has the following form:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev = load_dataset('miracl/miracl', lang, trust_remote_code=True)['dev']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'query_id': '0',\n",
+       " 'query': 'Is Creole a pidgin of French?',\n",
+       " 'positive_passages': [{'docid': '462221#4',\n",
+       "   'text': \"At the end of World War II in 1945, Korea was divided into North Korea and South Korea with North Korea (assisted by the Soviet Union), becoming a communist government after 1946, known as the Democratic People's Republic, followed by South Korea becoming the Republic of Korea. China became the communist People's Republic of China in 1949.
In 1950, the Soviet Union backed North Korea while the United States backed South Korea, and China allied with the Soviet Union in what was to become the first military action of the Cold War.\",\n", + " 'title': 'Eighth United States Army'},\n", + " {'docid': '29810#23',\n", + " 'text': 'The large size of Texas and its location at the intersection of multiple climate zones gives the state highly variable weather. The Panhandle of the state has colder winters than North Texas, while the Gulf Coast has mild winters. Texas has wide variations in precipitation patterns. El Paso, on the western end of the state, averages of annual rainfall, while parts of southeast Texas average as much as per year. Dallas in the North Central region averages a more moderate per year.',\n", + " 'title': 'Texas'},\n", + " {'docid': '3716905#0',\n", + " 'text': 'A French creole, or French-based creole language, is a creole language (contact language with native speakers) for which French is the \"lexifier\". Most often this lexifier is not modern French but rather a 17th-century koiné of French from Paris, the French Atlantic harbors, and the nascent French colonies. French-based creole languages are spoken natively by millions of people worldwide, primarily in the Americas and on archipelagos throughout the Indian Ocean. This article also contains information on French pidgin languages, contact languages that lack native speakers.',\n", + " 'title': 'French-based creole languages'},\n", + " {'docid': '22399755#18',\n", + " 'text': 'There are many hypotheses on the origins of Haitian Creole. Linguist John Singler suggests that it most likely emerged under French control in colonial years when shifted its economy focused heavily on sugar production. This resulted in a much larger population of enslaved Africans, whose interaction with the French created the circumstances for the dialect to evolve from a pidgin to a Creole. His research and the research of Claire Lefebvre of the Université du Québec à Montréal suggests that Creole, despite drawing 90% of its lexicon from French, is the syntactic cousin of Fon, a Gbe language of the Niger-Congo family spoken in Benin. At the time of the emergence of Haitian Creole, 50% of the enslaved Africans in Haiti were Gbe speakers.',\n", + " 'title': 'Haitian literature'}],\n", + " 'negative_passages': [{'docid': '1170520#2',\n", + " 'text': 'Louisiana Creole is a contact language that arose in the 18th century from interactions between speakers of the lexifier language of Standard French and several substrate or adstrate languages from Africa. Prior to its establishment as a Creole, the precursor was considered a pidgin language. The social situation that gave rise to the Louisiana Creole language was unique, in that the lexifier language was the language found at the contact site. More often the lexifier is the language that arrives at the contact site belonging to the substrate/adstrate languages. Neither the French, the French-Canadians, nor the African slaves were native to the area; this fact categorizes Louisiana Creole as a contact language that arose between exogenous ethnicities. 
Once the pidgin tongue was transmitted to the next generation as a \"lingua franca\" (who were then considered the first native speakers of the new grammar), it could effectively be classified as a creole language.',\n", + " 'title': 'Louisiana Creole'},\n", + " {'docid': '49823#1',\n", + " 'text': 'The precise number of creole languages is not known, particularly as many are poorly attested or documented. About one hundred creole languages have arisen since 1500. These are predominantly based on European languages such as English and French due to the European Age of Discovery and the Atlantic slave trade that arose at that time. With the improvements in ship-building and navigation, traders had to learn to communicate with people around the world, and the quickest way to do this was to develop a pidgin, or simplified language suited to the purpose; in turn, full creole languages developed from these pidgins. In addition to creoles that have European languages as their base, there are, for example, creoles based on Arabic, Chinese, and Malay. The creole with the largest number of speakers is Haitian Creole, with almost ten million native speakers, followed by Tok Pisin with about 4 million, most of whom are second-language speakers.',\n", + " 'title': 'Creole language'},\n", + " {'docid': '1651722#10',\n", + " 'text': 'Krio is an English-based creole from which descend Nigerian Pidgin English and Cameroonian Pidgin English and Pichinglis. It is also similar to English-based creole languages spoken in the Americas, especially the Gullah language, Jamaican Patois (Jamaican Creole), and Bajan Creole but it has its own distinctive character. It also shares some linguistic similarities with non-English creoles, such as the French-based creole languages in the Caribbean.',\n", + " 'title': 'Krio language'},\n", + " {'docid': '540382#4',\n", + " 'text': 'Until recently creoles were considered \"degenerate\" dialects of Portuguese unworthy of attention. As a consequence, there is little documentation on the details of their formation. Since the 20th century, increased study of creoles by linguists led to several theories being advanced. The monogenetic theory of pidgins assumes that some type of pidgin language — dubbed West African Pidgin Portuguese — based on Portuguese was spoken from the 15th to 18th centuries in the forts established by the Portuguese on the West African coast. According to this theory, this variety may have been the starting point of all the pidgin and creole languages. This may explain to some extent why Portuguese lexical items can be found in many creoles, but more importantly, it would account for the numerous grammatical similarities shared by such languages, such as the preposition \"na\", meaning \"in\" and/or \"on\", which would come from the Portuguese contraction \"na\" meaning \"in the\" (feminine singular).',\n", + " 'title': 'Portuguese-based creole languages'},\n", + " {'docid': '49823#7',\n", + " 'text': 'Other scholars, such as Salikoko Mufwene, argue that pidgins and creoles arise independently under different circumstances, and that a pidgin need not always precede a creole nor a creole evolve from a pidgin. 
Pidgins, according to Mufwene, emerged in trade colonies among \"users who preserved their native vernaculars for their day-to-day interactions.\" Creoles, meanwhile, developed in settlement colonies in which speakers of a European language, often indentured servants whose language would be far from the standard in the first place, interacted extensively with non-European slaves, absorbing certain words and features from the slaves\\' non-European native languages, resulting in a heavily basilectalized version of the original language. These servants and slaves would come to use the creole as an everyday vernacular, rather than merely in situations in which contact with a speaker of the superstrate was necessary.',\n", + " 'title': 'Creole language'},\n", + " {'docid': '11236157#2',\n", + " 'text': 'While many creoles around the world have lexicons based on languages other than Portuguese (e.g. English, French, Spanish, Dutch), it was hypothesized that such creoles were derived from this lingua franca by means of relexification, i.e. the process in which a pidgin or creole incorporates a significant amount of its lexicon from another language while keeping the grammar intact. There is some evidence that relexification is a real process. Pieter Muysken and show that there are languages which derive their grammar and lexicon from two different languages respectively, which could be easily explained with the relexification hypothesis. Also, Saramaccan seems to be a pidgin frozen in the middle of relexification from Portuguese to English. However, in cases of such mixed languages, as call them, there is never a one-to-one relationship between the grammar or lexicon of the mixed language and the grammar or lexicon of the language they attribute it to.',\n", + " 'title': 'Monogenetic theory of pidgins'},\n", + " {'docid': '1612877#8',\n", + " 'text': 'A mixed language differs from pidgins, creoles and code-switching in very fundamental ways. In most cases, mixed language speakers are fluent, even native, speakers of both languages; however, speakers of Michif (a N-V mixed language) are unique in that many are not fluent in both of the sources languages. Pidgins, on the other hand, develop in a situation, usually in the context of trade, where speakers of two (or more) different languages come into contact and need to find some way to communicate with each other. Creoles develop when a pidgin language becomes a first language for young speakers. While creoles tend to have drastically simplified morphologies, mixed languages often retain the inflectional complexities of one, or both, of parent languages. For instance, Michif retains the complexities of its French nouns and its Cree verbs.',\n", + " 'title': 'Mixed language'},\n", + " {'docid': '9606120#4',\n", + " 'text': 'While it is classified as a pidgin language, this is inaccurate. Speakers are already fluent in either English and French, and as such it is not used in situations where both parties lack a common tongue. As a whole, Camfranglais sets itself apart from other pidgins and creoles in that it consists of an array of languages, at least one of which is already known by those speaking it. For instance, while it contains elements of borrowing, code-switching, and pidgin languages, it is not a contact language as both parties can be presumed to speak French, the lexifer. 
Numerous other classifications have been proposed, like ‘pidgin’, ‘argot’, ‘youth language’, a ‘sabir camerounais’, an ‘appropriation vernaculaire du français’ or a ‘hybrid slang’. However, as Camfranglais is more developed than a slang, this too is insufficient. Kießling proposes it be classified as a \'highly hybrid sociolect of the urban youth type\", a definition that Stein-Kanjora agrees with.',\n",
+       "   'title': 'Camfranglais'}]}"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dev[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each item has four parts: `query_id`, `query`, `positive_passages`, and `negative_passages`. Here, `query_id` and `query` correspond to the id and text content of the query. `positive_passages` and `negative_passages` are lists of passages with their corresponding `docid`, `title`, and `text`. \n",
+    "\n",
+    "This structure is the same in the `train`, `dev`, `testA` and `testB` sets."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we process the ids and text of the queries and the corpus (the qrels of the dev set will be downloaded in the evaluation step):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "corpus_ids = corpus['docid']\n",
+    "corpus_text = []\n",
+    "for doc in corpus:\n",
+    "    corpus_text.append(f\"{doc['title']} {doc['text']}\".strip())\n",
+    "\n",
+    "queries_ids = dev['query_id']\n",
+    "queries_text = dev['query']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Evaluate from scratch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.1 Embedding"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this demo we use bge-base-en-v1.5; feel free to swap in the model you prefer."
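+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For instance, the same model could be loaded with an explicit query instruction, which the BGE v1.5 English models recommend for retrieval. This is just a sketch of the alternative initialization (`demo_model` is not used below); `use_fp16` trades a little precision for speed:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from FlagEmbedding import FlagModel\n",
+    "\n",
+    "# sketch: same model, with the recommended retrieval instruction for queries\n",
+    "demo_model = FlagModel(\n",
+    "    'BAAI/bge-base-en-v1.5',\n",
+    "    query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages: \",\n",
+    "    use_fp16=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now set a couple of environment variables to keep the logs clean, then encode the queries and the corpus:"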
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'\n", + "os.environ['SETUPTOOLS_USE_DISTUTILS'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "initial target device: 100%|██████████| 8/8 [00:29<00:00, 3.66s/it]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 52.84it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 55.15it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 56.49it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 55.22it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 49.22it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 54.69it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 49.16it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 50.77it/s]\n", + "Chunks: 100%|██████████| 8/8 [00:10<00:00, 1.27s/it]\n", + "pre tokenize: 100%|██████████| 16062/16062 [08:12<00:00, 32.58it/s] \n", + "pre tokenize: 100%|██████████| 16062/16062 [08:44<00:00, 30.60it/s]68s/it]\n", + "pre tokenize: 100%|██████████| 16062/16062 [08:39<00:00, 30.90it/s]41s/it]\n", + "pre tokenize: 100%|██████████| 16062/16062 [09:04<00:00, 29.49it/s]43s/it]\n", + "pre tokenize: 100%|██████████| 16062/16062 [09:27<00:00, 28.29it/s]it/s]t]\n", + "pre tokenize: 100%|██████████| 16062/16062 [09:08<00:00, 29.30it/s]32s/it]\n", + "pre tokenize: 100%|██████████| 16062/16062 [08:59<00:00, 29.77it/s]it/s]t]\n", + "pre tokenize: 100%|██████████| 16062/16062 [09:04<00:00, 29.50it/s]29s/it]\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:10<00:00, 15.59it/s] \n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:04<00:00, 15.68it/s]]\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:01<00:00, 15.72it/s]s]\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:28<00:00, 15.32it/s]\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:43<00:00, 15.10it/s]\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:27<00:00, 15.34it/s]\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:36<00:00, 15.20it/s]\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [17:31<00:00, 15.28it/s]\n", + "Chunks: 100%|██████████| 8/8 [27:49<00:00, 208.64s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape of the embeddings: (32893221, 768)\n", + "data type of the embeddings: float16\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# get the BGE embedding model\n", + "model = FlagModel('BAAI/bge-base-en-v1.5')\n", + "\n", + "# get the embedding of the queries and corpus\n", + "queries_embeddings = model.encode_queries(queries_text)\n", + "corpus_embeddings = model.encode_corpus(corpus_text)\n", + "\n", + "print(\"shape of the embeddings:\", corpus_embeddings.shape)\n", + "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a Faiss index to store the embeddings." 
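+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that a flat float32 index keeps every vector in memory. A quick back-of-envelope check (a sketch; the document count is the one printed above) shows why this corpus needs a large-memory machine; an IVF index is a common lighter alternative, at the cost of approximate search. The `nlist`/`nprobe` values below are illustrative, not tuned:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import faiss\n",
+    "\n",
+    "# back-of-envelope memory for a flat float32 index of this corpus\n",
+    "n_docs, dim = 32893221, 768\n",
+    "print(f\"flat index memory: ~{n_docs * dim * 4 / 1024**3:.0f} GiB\")\n",
+    "\n",
+    "# sketch of an approximate alternative: an IVF index with 4096 clusters\n",
+    "ivf_index = faiss.index_factory(dim, \"IVF4096,Flat\", faiss.METRIC_INNER_PRODUCT)\n",
+    "# ivf_index.train(corpus_embeddings)  # IVF indexes must be trained before adding\n",
+    "# ivf_index.add(corpus_embeddings)\n",
+    "# ivf_index.nprobe = 64               # more probes = better recall, slower search"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we keep the exact flat index with inner-product similarity:"
+   ]
+  },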
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total number of vectors: 32893221\n"
+     ]
+    }
+   ],
+   "source": [
+    "import faiss\n",
+    "import numpy as np\n",
+    "\n",
+    "# get the dimension of the embedding vectors; bge-base-en-v1.5 produces 768-dimensional vectors\n",
+    "dim = corpus_embeddings.shape[-1]\n",
+    "\n",
+    "# create the faiss index and store the corpus embeddings into the vector space\n",
+    "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n",
+    "corpus_embeddings = corpus_embeddings.astype(np.float32)\n",
+    "# add the embeddings to the index (train() is a no-op for a flat index)\n",
+    "index.train(corpus_embeddings)\n",
+    "index.add(corpus_embeddings)\n",
+    "\n",
+    "print(f\"total number of vectors: {index.ntotal}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3 Searching"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use the Faiss index to search for each query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Searching: 100%|██████████| 25/25 [15:03<00:00, 36.15s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from tqdm import tqdm\n",
+    "\n",
+    "query_size = len(queries_embeddings)\n",
+    "\n",
+    "all_scores = []\n",
+    "all_indices = []\n",
+    "\n",
+    "for i in tqdm(range(0, query_size, 32), desc=\"Searching\"):\n",
+    "    j = min(i + 32, query_size)\n",
+    "    query_embedding = queries_embeddings[i: j]\n",
+    "    score, indice = index.search(query_embedding.astype(np.float32), k=100)\n",
+    "    all_scores.append(score)\n",
+    "    all_indices.append(indice)\n",
+    "\n",
+    "all_scores = np.concatenate(all_scores, axis=0)\n",
+    "all_indices = np.concatenate(all_indices, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then map the search results back to the docids in the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = {}\n",
+    "for idx, (scores, indices) in enumerate(zip(all_scores, all_indices)):\n",
+    "    results[queries_ids[idx]] = {}\n",
+    "    # doc_idx renamed from 'index' to avoid shadowing the faiss index above\n",
+    "    for score, doc_idx in zip(scores, indices):\n",
+    "        if doc_idx != -1:\n",
+    "            results[queries_ids[idx]][corpus_ids[doc_idx]] = float(score)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.4 Evaluating"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Download the qrels file for evaluation:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "--2024-11-21 10:26:16--  https://hf-mirror.com/datasets/miracl/miracl/resolve/main/miracl-v1.0-en/qrels/qrels.miracl-v1.0-en-dev.tsv\n",
+      "Resolving hf-mirror.com (hf-mirror.com)... 153.121.57.40, 133.242.169.68, 160.16.199.204\n",
+      "Connecting to hf-mirror.com (hf-mirror.com)|153.121.57.40|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 167817 (164K) [text/plain]\n",
+      "Saving to: ‘qrels.miracl-v1.0-en-dev.tsv’\n",
+      "\n",
+      "     0K .......... .......... .......... .......... ..........  30%  109K 1s\n",
+      "    50K .......... .......... .......... .......... ..........  61% 44.5K 1s\n",
+      "   100K .......... .......... .......... .......... ..........  91% 69.6K 0s\n",
+      "   150K .......... ...
100% 28.0K=2.8s\n",
+      "\n",
+      "2024-11-21 10:26:20 (58.6 KB/s) - ‘qrels.miracl-v1.0-en-dev.tsv’ saved [167817/167817]\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "endpoint = os.getenv('HF_ENDPOINT', 'https://huggingface.co')\n",
+    "file_name = \"qrels.miracl-v1.0-en-dev.tsv\"\n",
+    "qrel_url = f\"wget {endpoint}/datasets/miracl/miracl/resolve/main/miracl-v1.0-en/qrels/{file_name}\"\n",
+    "\n",
+    "os.system(qrel_url)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read the qrels from the file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qrels_dict = {}\n",
+    "with open(file_name, \"r\", encoding=\"utf-8\") as f:\n",
+    "    for line in f.readlines():\n",
+    "        qid, _, docid, rel = line.strip().split(\"\\t\")\n",
+    "        qid, docid, rel = str(qid), str(docid), int(rel)\n",
+    "        if qid not in qrels_dict:\n",
+    "            qrels_dict[qid] = {}\n",
+    "        qrels_dict[qid][docid] = rel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, use the [pytrec_eval](https://github.com/cvangysel/pytrec_eval) library to calculate the scores of the selected metrics:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "defaultdict(<class 'list'>, {'NDCG@10': 0.46073, 'NDCG@100': 0.54336})\n",
+      "defaultdict(<class 'list'>, {'Recall@10': 0.55972, 'Recall@100': 0.83827})\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pytrec_eval\n",
+    "from collections import defaultdict\n",
+    "\n",
+    "ndcg_string = \"ndcg_cut.\" + \",\".join([str(k) for k in [10,100]])\n",
+    "recall_string = \"recall.\" + \",\".join([str(k) for k in [10,100]])\n",
+    "\n",
+    "evaluator = pytrec_eval.RelevanceEvaluator(\n",
+    "    qrels_dict, {ndcg_string, recall_string}\n",
+    ")\n",
+    "scores = evaluator.evaluate(results)\n",
+    "\n",
+    "all_ndcgs, all_recalls = defaultdict(list), defaultdict(list)\n",
+    "for query_id in scores.keys():\n",
+    "    for k in [10,100]:\n",
+    "        all_ndcgs[f\"NDCG@{k}\"].append(scores[query_id][\"ndcg_cut_\" + str(k)])\n",
+    "        all_recalls[f\"Recall@{k}\"].append(scores[query_id][\"recall_\" + str(k)])\n",
+    "\n",
+    "ndcg, recall = (\n",
+    "    all_ndcgs.copy(),\n",
+    "    all_recalls.copy(),\n",
+    ")\n",
+    "\n",
+    "for k in [10,100]:\n",
+    "    ndcg[f\"NDCG@{k}\"] = round(sum(ndcg[f\"NDCG@{k}\"]) / len(scores), 5)\n",
+    "    recall[f\"Recall@{k}\"] = round(sum(recall[f\"Recall@{k}\"]) / len(scores), 5)\n",
+    "\n",
+    "print(ndcg)\n",
+    "print(recall)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Evaluate using FlagEmbedding"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We provide independent evaluation for popular datasets and benchmarks. Try the following code to run the evaluation, or run the shell script provided in the [example](../../examples/evaluation/miracl/eval_miracl.sh) folder."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "arguments = \"\"\"- \\\n", + " --eval_name miracl \\\n", + " --dataset_dir ./miracl/data \\\n", + " --dataset_names en \\\n", + " --splits dev \\\n", + " --corpus_embd_save_dir ./miracl/corpus_embd \\\n", + " --output_dir ./miracl/search_results \\\n", + " --search_top_k 100 \\\n", + " --cache_path ./cache/data \\\n", + " --overwrite True \\\n", + " --k_values 10 100 \\\n", + " --eval_output_method markdown \\\n", + " --eval_output_path ./miracl/miracl_eval_results.md \\\n", + " --eval_metrics ndcg_at_10 recall_at_100 \\\n", + " --embedder_name_or_path BAAI/bge-base-en-v1.5 \\\n", + " --devices cuda:0 cuda:1 \\\n", + " --embedder_batch_size 1024\n", + "\"\"\".replace('\\n','')\n", + "\n", + "sys.argv = arguments.split()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/anaconda3/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "initial target device: 100%|██████████| 2/2 [00:09<00:00, 4.98s/it]\n", + "pre tokenize: 100%|██████████| 16062/16062 [18:01<00:00, 14.85it/s] \n", + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "/root/anaconda3/envs/dev/lib/python3.12/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n", + " warnings.warn(\n", + "pre tokenize: 100%|██████████| 16062/16062 [18:44<00:00, 14.29it/s]92s/it]\n", + "Inference Embeddings: 0%| | 42/16062 [00:54<8:28:19, 1.90s/it]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "Inference Embeddings: 0%| | 43/16062 [00:56<8:22:03, 1.88s/it]/root/anaconda3/envs/dev/lib/python3.12/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. 
Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n", + " warnings.warn(\n", + "Inference Embeddings: 100%|██████████| 16062/16062 [48:29<00:00, 5.52it/s] \n", + "Inference Embeddings: 100%|██████████| 16062/16062 [48:55<00:00, 5.47it/s]\n", + "Chunks: 100%|██████████| 2/2 [1:10:57<00:00, 2128.54s/it] \n", + "pre tokenize: 100%|██████████| 1/1 [00:11<00:00, 11.06s/it]\n", + "pre tokenize: 100%|██████████| 1/1 [00:12<00:00, 12.72s/it]\n", + "Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00, 32.15it/s]\n", + "Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00, 39.80it/s]\n", + "Chunks: 100%|██████████| 2/2 [00:31<00:00, 15.79s/it]\n", + "Searching: 100%|██████████| 25/25 [00:00<00:00, 26.24it/s]\n", + "Qrels not found in ./miracl/data/en/dev_qrels.jsonl. Trying to download the qrels from the remote and save it to ./miracl/data/en.\n", + "--2024-11-20 13:00:40-- https://hf-mirror.com/datasets/miracl/miracl/resolve/main/miracl-v1.0-en/qrels/qrels.miracl-v1.0-en-dev.tsv\n", + "Resolving hf-mirror.com (hf-mirror.com)... 133.242.169.68, 153.121.57.40, 160.16.199.204\n", + "Connecting to hf-mirror.com (hf-mirror.com)|133.242.169.68|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 167817 (164K) [text/plain]\n", + "Saving to: ‘./cache/data/miracl/qrels.miracl-v1.0-en-dev.tsv’\n", + "\n", + " 0K .......... .......... .......... .......... .......... 30% 336K 0s\n", + " 50K .......... .......... .......... .......... .......... 61% 678K 0s\n", + " 100K .......... .......... .......... .......... .......... 91% 362K 0s\n", + " 150K .......... ... 100% 39.8K=0.7s\n", + "\n", + "2024-11-20 13:00:42 (231 KB/s) - ‘./cache/data/miracl/qrels.miracl-v1.0-en-dev.tsv’ saved [167817/167817]\n", + "\n", + "Loading and Saving qrels: 100%|██████████| 8350/8350 [00:00<00:00, 184554.95it/s]\n" + ] + } + ], + "source": [ + "from transformers import HfArgumentParser\n", + "\n", + "from FlagEmbedding.evaluation.miracl import (\n", + " MIRACLEvalArgs, MIRACLEvalModelArgs,\n", + " MIRACLEvalRunner\n", + ")\n", + "\n", + "\n", + "parser = HfArgumentParser((\n", + " MIRACLEvalArgs,\n", + " MIRACLEvalModelArgs\n", + "))\n", + "\n", + "eval_args, model_args = parser.parse_args_into_dataclasses()\n", + "eval_args: MIRACLEvalArgs\n", + "model_args: MIRACLEvalModelArgs\n", + "\n", + "runner = MIRACLEvalRunner(\n", + " eval_args=eval_args,\n", + " model_args=model_args\n", + ")\n", + "\n", + "runner.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"en-dev\": {\n", + " \"ndcg_at_10\": 0.46053,\n", + " \"ndcg_at_100\": 0.54313,\n", + " \"map_at_10\": 0.35928,\n", + " \"map_at_100\": 0.38726,\n", + " \"recall_at_10\": 0.55972,\n", + " \"recall_at_100\": 0.83809,\n", + " \"precision_at_10\": 0.14018,\n", + " \"precision_at_100\": 0.02347,\n", + " \"mrr_at_10\": 0.54328,\n", + " \"mrr_at_100\": 0.54929\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "with open('miracl/search_results/bge-base-en-v1.5/NoReranker/EVAL/eval_results.json', 'r') as content_file:\n", + " print(content_file.read())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/4_Evaluation/4.5.2.ipynb b/docs/source/tutorial/4_Evaluation/4.5.2.ipynb new file mode 100644 index 00000000..4da1ec1e --- /dev/null +++ b/docs/source/tutorial/4_Evaluation/4.5.2.ipynb @@ -0,0 +1,606 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate on MLDR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[MLDR](https://huggingface.co/datasets/Shitao/MLDR) is a Multilingual Long-Document Retrieval dataset built on Wikipeida, Wudao and mC4, covering 13 typologically diverse languages. Specifically, we sample lengthy articles from Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them. Then we use GPT-3.5 to generate questions based on these paragraphs. The generated question and the sampled article constitute a new text pair to the dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First install the libraries we are using:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "% pip install FlagEmbedding pytrec_eval" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the dataset of 13 different languages from [Hugging Face](https://huggingface.co/datasets/Shitao/MLDR)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Language Code | Language | Source | #train | #dev | #test | #corpus | Avg. Length of Docs |\n", + "| :-----------: | :--------: | :--------------: | :-----: | :---: | :---: | :-----: | :-----------------: |\n", + "| ar | Arabic | Wikipedia | 1,817 | 200 | 200 | 7,607 | 9,428 |\n", + "| de | German | Wikipedia, mC4 | 1,847 | 200 | 200 | 10,000 | 9,039 |\n", + "| en | English | Wikipedia | 10,000 | 200 | 800 | 200,000 | 3,308 |\n", + "| es | Spanish | Wikipedia, mc4 | 2,254 | 200 | 200 | 9,551 | 8,771 |\n", + "| fr | French | Wikipedia | 1,608 | 200 | 200 | 10,000 | 9,659 |\n", + "| hi | Hindi | Wikipedia | 1,618 | 200 | 200 | 3,806 | 5,555 |\n", + "| it | Italian | Wikipedia | 2,151 | 200 | 200 | 10,000 | 9,195 |\n", + "| ja | Japanese | Wikipedia | 2,262 | 200 | 200 | 10,000 | 9,297 |\n", + "| ko | Korean | Wikipedia | 2,198 | 200 | 200 | 6,176 | 7,832 |\n", + "| pt | Portuguese | Wikipedia | 1,845 | 200 | 200 | 6,569 | 7,922 |\n", + "| ru | Russian | Wikipedia | 1,864 | 200 | 200 | 10,000 | 9,723 |\n", + "| th | Thai | mC4 | 1,970 | 200 | 200 | 10,000 | 8,089 |\n", + "| zh | Chinese | Wikipedia, Wudao | 10,000 | 200 | 800 | 200,000 | 4,249 |\n", + "| Total | - | - | 41,434 | 2,600 | 3,800 | 493,709 | 4,737 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First download the queries and corresponding qrels:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "lang = \"en\"\n", + "dataset = load_dataset('Shitao/MLDR', lang, trust_remote_code=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each item has four parts: `query_id`, `query`, `positive_passages`, and `negative_passages`. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Each item has four parts: `query_id`, `query`, `positive_passages`, and `negative_passages`. `query_id` and `query` correspond to the id and text content of the query. `positive_passages` and `negative_passages` are lists of passages with their corresponding `docid` and `text`. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'query_id': 'q-en-1',\n",
+       " 'query': 'What is the syntax for the shorthand of the conditional operator in PHP 5.3?',\n",
+       " 'positive_passages': [{'docid': 'doc-en-8',\n",
+       "   'text': 'In computer programming, is a ternary operator that is part of the syntax for basic conditional expressions in several programming languages. It is commonly referred to as the conditional operator, inline if (iif), or ternary if. An expression evaluates to if the value of is true, and otherwise to . One can read it aloud as \"if a then b otherwise c\".\\n\\nIt originally comes from CPL, in which equivalent syntax for e1 ? e2 : e3 was e1 → e2, e3.\\n\\nAlthough many ternary operators are possible, the conditional operator is so common, and other ternary operators so rare, that the conditional operator is commonly referred to as the ternary operator.\\n\\nVariations\\nThe detailed semantics of \"the\" ternary operator as well as its syntax differs significantly from language to language.\\n\\nA top level distinction from one language to another is whether the expressions permit side effects (as in most procedural languages) and whether the language provides short-circuit evaluation semantics, whereby only the selected expression is evaluated (most standard operators in most languages evaluate all arguments).\\n\\nIf the language supports expressions with side effects but does not specify short-circuit evaluation, then a further distinction exists about which expression evaluates first—if the language guarantees any specific order (bear in mind that the conditional also counts as an expression).\\n\\nFurthermore, if no order is guaranteed, a distinction exists about whether the result is then classified as indeterminate (the value obtained from some order) or undefined (any value at all at the whim of the compiler in the face of side effects, or even a crash).\\n\\nIf the language does not permit side-effects in expressions (common in functional languages), then the order of evaluation has no value semantics—though it may yet bear on whether an infinite recursion terminates, or have other performance implications (in a functional language with match expressions, short-circuit evaluation is inherent, and natural uses for the ternary operator arise less often, so this point is of limited concern).\\n\\nFor these reasons, in some languages the statement form can have subtly different semantics than the block conditional form } (in the C language—the syntax of the example given—these are in fact equivalent).\\n\\nThe associativity of nested ternary operators can also differ from language to language. In almost all languages, the ternary operator is right associative so that evaluates intuitively as , but PHP in particular is notoriously left-associative, and evaluates as follows: , which is rarely what any programmer expects.
(The given examples assume that the ternary operator has low operator precedence, which is true in all C-family languages, and many others.)\\n\\nEquivalence to map\\nThe ternary operator can also be viewed as a binary map operation.\\n\\nIn R—and other languages with literal expression tuples—one can simulate the ternary operator with something like the R expression (this idiom is slightly more natural in languages with 0-origin subscripts).\\n\\nHowever, in this idiom it is almost certain that the entire tuple expression will evaluate prior to the subscript expression, so there will be no short-circuit semantics.\\n\\nNested ternaries can be simulated as where the function returns the index of the first true value in the condition vector. Note that both of these map equivalents are binary operators, revealing that the ternary operator is ternary in syntax, rather than semantics. These constructions can be regarded as a weak form of currying based on data concatenation rather than function composition.\\n\\nIf the language provides a mechanism of futures or promises, then short-circuit evaluation can sometimes also be simulated in the context of a binary map operation.\\n\\nConditional assignment\\n is used as follows:\\n\\n condition ? value_if_true : value_if_false\\n\\nThe condition is evaluated true or false as a Boolean expression. On the basis of the evaluation of the Boolean condition, the entire expression returns value_if_true if condition is true, but value_if_false otherwise. Usually the two sub-expressions value_if_true and value_if_false must have the same type, which determines the type of the whole expression. The importance of this type-checking lies in the operator\\'s most common use—in conditional assignment statements. In this usage it appears as an expression on the right side of an assignment statement, as follows:\\n\\n variable = condition ? value_if_true : value_if_false\\n\\nThe ?: operator is similar to the way conditional expressions (if-then-else constructs) work in functional programming languages, like Scheme, ML, and Haskell, since if-then-else forms an expression instead of a statement in those languages.\\n\\nUsage\\nThe conditional operator\\'s most common usage is to make a terse simple conditional assignment statement. For example, if we wish to implement some C code to change a shop\\'s normal opening hours from 9 o\\'clock to 12 o\\'clock on Sundays, we may use\\n\\nint opening_time = (day == SUNDAY) ? 12 : 9;\\n\\ninstead of the more verbose\\n\\nint opening_time;\\n\\nif (day == SUNDAY)\\n opening_time = 12;\\nelse\\n opening_time = 9;\\n\\nThe two forms are nearly equivalent. Keep in mind that the is an expression and if-then-else is a statement. Note that neither the true nor false portions can be omitted from the conditional operator without an error report upon parsing. This contrasts with if-then-else statements, where the else clause can be omitted.\\n\\nMost of the languages emphasizing functional programming don\\'t need such an operator as their regular conditional expression(s) is an expression in the first place e.g. the Scheme expression is equivalent in semantics to the C expression . This is also the case in many imperative languages, starting with ALGOL where it is possible to write , or Smalltalk () or Ruby (, although works as well).\\n\\nNote that some languages may evaluate both the true- and false-expressions, even though only one or the other will be assigned to the variable. 
This means that if the true- or false-expression contain a function call, that function may be called and executed (causing any related side-effects due to the function\\'s execution), regardless of whether or not its result will be used. Programmers should consult their programming language specifications or test the ternary operator to determine whether or not the language will evaluate both expressions in this way. If it does, and this is not the desired behaviour, then an if-then-else statement should be used.\\n\\nActionScript 3\\ncondition ? value_if_true : value_if_false\\n\\nAda\\nThe 2012 edition of Ada has introduced conditional expressions (using and ), as part of an enlarged set of expressions including quantified expressions and expression functions. The Rationale for Ada 2012 states motives for Ada not having had them before, as well as motives for now adding them, such as to support \"contracts\" (also new).\\n\\nPay_per_Hour := (if Day = Sunday\\n then 12.50\\n else 10.00);\\n\\nWhen the value of an if_expression is itself of Boolean type, then the part may be omitted, the value being True. Multiple conditions may chained using .\\n\\nALGOL 68\\nBoth ALGOL 68\\'s choice clauses (if and the case clauses) provide the coder with a choice of either the \"bold\" syntax or the \"brief\" form.\\n\\n Single if choice clause:\\n if condition then statements [ else statements ] fi\\n \"brief\" form: ( condition | statements | statements )\\n\\n Chained if choice clause:\\n if condition1 then statements elif condition2 then statements [ else statements ] fi\\n \"brief\" form: ( condition1 | statements |: condition2 | statements | statements )\\n\\nAPL\\nWith the following syntax, both expressions are evaluated (with evaluated first, then , then ):\\n\\nresult ← value_if_true ⊣⍣ condition ⊢ value_if_false\\n\\nThis alternative syntax provides short-circuit evaluation:\\n\\nresult ← { condition : expression_if_true ⋄ expression_if_false } ⍬\\n\\nAWK\\nresult = condition ? value_if_true : value_if_false\\n\\nBash\\nA true ternary operator only exists for arithmetic expressions:\\n\\n((result = condition ? value_if_true : value_if_false))\\n\\nFor strings there only exist workarounds, like e.g.:\\n\\nresult=$([[ \"$a\" = \"$b\" ]] && echo \"value_if_true\" || echo \"value_if_false\")\\n\\nWhere can be any condition construct can evaluate. Instead of the there can be any other bash command. When it exits with success, the first echo command is executed, otherwise the second one is executed.\\n\\nC\\nA traditional if-else construct in C, Java and JavaScript is written:\\n\\nif (a > b) {\\n result = x;\\n}\\nelse {\\n result = y;\\n}\\n\\nThis can be rewritten as the following statement:\\n\\nresult = a > b ? x : y;\\n\\nAs in the if-else construct only one of the expressions \\'x\\' and \\'y\\' is evaluated. This is significant if the evaluation of \\'x\\' or \\'y\\' has side effects. The behaviour is undefined if an attempt is made to use the result of the conditional operator as an lvalue.\\n\\nA GNU extension to C allows omitting the second operand, and using implicitly the first operand as the second also:\\n\\na == x ? : y;\\n\\nThe expression is equivalent to\\n\\na == x ? (a == x) : y;\\n\\nexcept that if x is an expression, it is evaluated only once. The difference is significant if evaluating the expression has side effects. 
This shorthand form is sometimes known as the Elvis operator in other languages.\\n\\nC#\\nIn C#, if condition is true, first expression is evaluated and becomes the result; if false, the second expression is evaluated and becomes the result. As with Java only one of two expressions is ever evaluated.\\n\\n// condition ? first_expression : second_expression;\\n\\nstatic double sinc(double x) \\n{\\n return x != 0.0 ? Math.Sin(x) / x : 1.0;\\n}\\n\\nC++\\nUnlike in C, the precedence of the operator in C++ is the same as that of the assignment operator ( or ), and it can return an lvalue. This means that expressions like and are both legal and are parsed differently, the former being equivalent to .\\n\\nIn C++ there are conditional assignment situations where use of the if-else statement is impossible, since this language explicitly distinguishes between initialization and assignment. In such case it is always possible to use a function call, but this can be cumbersome and inelegant. For example, to pass conditionally different values as an argument for a constructor of a field or a base class, it is impossible to use a plain if-else statement; in this case we can use a conditional assignment expression, or a function call. Bear in mind also that some types allow initialization, but do not allow assignment, or even that the assignment operator and the constructor do totally different things. This last is true for reference types, for example:\\n\\n#include \\n#include \\n#include \\n\\nint main(int argc, char *argv[])\\n{\\n std::string name;\\n std::ofstream fout;\\n\\n if (argc > 1 && argv[1])\\n {\\n name = argv[1];\\n fout.open(name.c_str(), std::ios::out | std::ios::app);\\n }\\n\\n std::ostream &sout = name.empty() ? std::cout : fout;\\n\\n sout << \"Hello, world!\\\\n\";\\n\\n return 0;\\n}\\n\\nIn this case there is no possibility of using an if-else statement in place of the operator (Although we can replace the use of with a function call, inside of which can be an if-else statement).\\n\\nFurthermore, the conditional operator can yield an lvalue, i.e. a value to which another value can be assigned. Consider the following example:\\n\\n#include \\n\\nint main(int argc, char *argv[]) \\n{\\n int a = 0;\\n int b = 0;\\n\\n (argc > 1 ? a : b) = 1;\\n\\n std::cout << \"a: \" << a\\n << \" b: \" << b\\n << \\'\\\\n\\';\\n\\n return 0;\\n}\\n\\nIn this example, if the boolean expression yields the value on line 8, the value is assigned to the variable , otherwise the value is assigned to the variable .\\n\\nIn C++ and other various languages, ternary operators like are also possible but are very rare.\\n\\nCFML\\nExample of the operator in CFML:\\n\\nresult = randRange(0,1) ? \"heads\" : \"tails\";\\n\\nRoughly 50% of the time the expression will return 1 (true) or 0 (false); meaning result will take the value \"heads\" or \"tails\" respectively.\\n\\nLucee, Railo, and ColdFusion 11-specific\\nLucee, Railo, and ColdFusion 11 also implement the Elvis operator, which will return the value of the expression if it is not-null, otherwise the specified default.\\n\\nSyntax:\\n\\nresult = expression ?: value_if_expression_is_null\\n\\nExample:\\n\\nresult = f() ?: \"default\";\\n\\n// where...\\nfunction f(){\\n if (randRange(0,1)){ // either 0 or 1 (false / true)\\n return \"value\";\\n }\\n}\\n\\nwriteOutput(result);\\n\\nThe function will return roughly 50% of the time, otherwise will not return anything. 
If returns \"value\", will take that value, otherwise will take the value \"default\".\\n\\nCoffeeScript\\nExample of using this operator in CoffeeScript:\\n\\nif 1 is 2 then \"true value\" else \"false value\"\\n\\nReturns \"false value\".\\n\\nCommon Lisp\\nAssignment using a conditional expression in Common Lisp:\\n\\n(setf result (if (> a b) x y))\\n\\nAlternative form:\\n\\n(if (> a b)\\n (setf result x)\\n (setf result y))\\n\\nCrystal\\nExample of using this operator in Crystal:\\n\\n1 == 2 ? \"true value\" : \"false value\"\\n\\nReturns .\\n\\nThe Crystal compiler transforms conditional operators to expressions, so the above is semantically identical to:\\n\\nif 1 == 2\\n \"true value\"\\nelse\\n \"false value\"\\nend\\n\\nDart\\nThe Dart programming language\\'s syntax belongs to the C family, primarily inspired by languages like Java, C# and JavaScript, which means it has inherited the traditional syntax for its conditional expression.\\n\\nExample:\\n\\nreturn x.isEven ? x ~/ 2 : x * 3 + 1;\\n\\nLike other conditions in Dart, the expression before the must evaluate to a Boolean value.\\n\\nThe Dart syntax uses both and in various other ways, which causes ambiguities in the language grammar. An expression like:\\n\\n{ x as T ? [1] : [2] }\\n\\ncould be parsed as either a \"set literal\" containing one of two lists or as a \"map literal\" {((x as T?)[1]) : [2]}. The language always chooses the conditional expression in such situations.\\n\\nDart also has a second ternary operator, the operator commonly used for setting values in lists or maps, which makes the term \"the ternary operator\" ambiguous in a Dart context.\\n\\nDelphi\\nIn Delphi the function can be used to achieve the same as . If the library is used, the function returns a numeric value such as an Integer, Double or Extended. If the library is used, this function can also return a string value.\\n\\nUsing \\n\\nfunction IfThen(AValue: Boolean; const ATrue: Integer; const AFalse: Integer): Integer;\\nfunction IfThen(AValue: Boolean; const ATrue: Int64; const AFalse: Int64): Int64;\\nfunction IfThen(AValue: Boolean; const ATrue: UInt64; const AFalse: UInt64): UInt64;\\nfunction IfThen(AValue: Boolean; const ATrue: Single; const AFalse: Single): Single;\\nfunction IfThen(AValue: Boolean; const ATrue: Double; const AFalse: Double): Double;\\nfunction IfThen(AValue: Boolean; const ATrue: Extended; const AFalse: Extended): Extended;\\n\\nUsing the library\\n\\nfunction IfThen(AValue: Boolean; const ATrue: string; AFalse: string = \\'\\'): string;\\n\\nUsage example:\\n\\nfunction GetOpeningTime(Weekday: Integer): Integer;\\nbegin\\n { This function will return the opening time for the given weekday: 12 for Sundays, 9 for other days }\\n Result := IfThen((Weekday = 1) or (Weekday = 7), 12, 9);\\nend;\\n\\nUnlike a true ternary operator however, both of the results are evaluated prior to performing the comparison. For example, if one of the results is a call to a function which inserts a row into a database table, that function will be called whether or not the condition to return that specific result is met.\\n\\nF#\\n\\nIn F# the built-in syntax for if-then-else is already an expression that always must return a value.\\n\\nlet num = if x = 10 then 42 else 24\\n\\nF# has a special case where you can omit the else branch if the return value is of type unit. 
This way you can do side-effects, without using a else branch.\\n\\nif x = 10 then\\n printfn \"It is 10\"\\n\\nBut even in this case, the if expression would return unit. You don\\'t need to write the else branch, because the compiler will assume the unit type on else.\\n\\nFORTH\\nSince FORTH is a stack-oriented language, and any expression can leave a value on the stack, all // sequences can generate values:\\n\\n: test ( n -- n ) 1 AND IF 22 ELSE 42 THEN ;\\n\\nThis word takes 1 parameter on the stack, and if that number is odd, leaves 22. If it\\'s even, 42 is left on the stack.\\n\\nFortran\\nWith the additions to the code in the 1995 release, the ternary operator was added to the Fortran compiler as the intrinsic function :\\n\\nvariable = merge(x,y,a>b)\\n\\nNote that both x and y are evaluated before the results of one or the other are returned from the function. Here, x is returned if the condition holds true and y otherwise.\\n\\nFreeMarker \\nThis built-in exists since FreeMarker 2.3.20.\\n\\nUsed like booleanExp?then(whenTrue, whenFalse), fills the same role as the ternary operator in C-like languages.\\n\\n<#assign x = 10>\\n<#assign y = 20>\\n<#-- Prints the maximum of x and y: -->\\n${(x > y)?then(x, y)}\\n\\nGo\\nThere is no ternary if in Go, so use of the full if statement is always required.\\n\\nHaskell\\nThe built-in if-then-else syntax is inline: the expression\\n\\nif predicate then expr1 else expr2\\n\\nhas type\\n\\nBool -> a -> a -> a\\n\\nThe base library also provides the function :\\n\\nbool :: a -> a -> Bool -> a\\n\\nIn both cases, no special treatment is needed to ensure that only the selected expression is evaluated, since Haskell is non-strict by default. This also means an operator can be defined that, when used in combination with the operator, functions exactly like in most languages:\\n\\n(?) :: Bool -> a -> a -> a\\n(?) pred x y = if pred then x else y\\ninfix 1 ?\\n\\n-- example (vehicle will evaluate to \"airplane\"):\\narg = \\'A\\'\\nvehicle = arg == \\'B\\' ? \"boat\" $\\n arg == \\'A\\' ? \"airplane\" $\\n arg == \\'T\\' ? \"train\" $\\n \"car\"\\n\\nHowever, it is more idiomatic to use pattern guards\\n\\n-- example (vehicle will evaluate to \"airplane\"):\\narg = \\'A\\'\\nvehicle | arg == \\'B\\' = \"boat\"\\n | arg == \\'A\\' = \"airplane\"\\n | arg == \\'T\\' = \"train\"\\n | otherwise = \"car\"\\n\\nJava\\nIn Java this expression evaluates to:\\n\\n// If foo is selected, assign selected foo to bar. If not, assign baz to bar.\\nObject bar = foo.isSelected() ? foo : baz; \\n\\nNote that Java, in a manner similar to C#, only evaluates the used expression and will not evaluate the unused expression.\\n\\nJulia\\nIn Julia, \"Note that the spaces around and are mandatory: an expression like is not a valid ternary expression (but a newline is acceptable after both the and the ).\"\\n\\nJavaScript\\nThe conditional operator in JavaScript is similar to that of C++ and Java, except for the fact the middle expression cannot be a comma expression. Also, as in C++, but unlike in C or Perl, it will not bind tighter than an assignment to its right— is equivalent to instead of .\\n\\nvar timeout = settings !== null ? 
settings.timeout : 1000;\\n\\nJust like C# and Java, the expression will only be evaluated if, and only if, the expression is the matching one for the condition given; the other expression will not be evaluated.\\n\\nKotlin \\nKotlin does not include the traditional ternary operator, however, s can be used as expressions that can be assigned, achieving the same results. Note that, as the complexity of your conditional statement grows, you might consider replacing your - expression with a expression.\\n\\nval max = if (a > b) a else b\\n\\nLua \\nLua does not have a traditional conditional operator. However, the short-circuiting behaviour of its and operators allows the emulation of this behaviour:\\n\\n-- equivalent to var = cond ? a : b;\\nvar = cond and a or b\\n\\nThis will succeed unless is logically false (i.e. or ); in this case, the expression will always result in . This can result in some surprising behaviour if ignored.\\n\\nSQL\\nThe SQL expression is a generalization of the ternary operator. Instead of one conditional and two results, n conditionals and n+1 results can be specified.\\n\\nWith one conditional it is equivalent (although more verbose) to the ternary operator:\\n\\nSELECT (CASE WHEN a > b THEN x ELSE y END) AS CONDITIONAL_EXAMPLE\\n FROM tab;\\n\\nThis can be expanded to several conditionals:\\n\\nSELECT (CASE WHEN a > b THEN x WHEN a < b THEN y ELSE z END) AS CONDITIONAL_EXAMPLE\\n FROM tab;\\n\\nMySQL\\nIn addition to the standard expression, MySQL provides an function as an extension:\\n\\nIF(cond, a, b);\\n\\nSQL Server\\nIn addition to the standard expression, SQL Server (from 2012) provides an function:\\n\\nIIF(condition, true_value, false_value)\\n\\nOracle SQL\\nIn addition to the standard expression, Oracle has a variadic functional counterpart which operates similarly to a switch statement and can be used to emulate the conditional operator when testing for equality.\\n\\n-- General syntax takes case-result pairs, comparing against an expression, followed by a fall-back result:\\nDECODE(expression, case1, result1,\\n ...\\n caseN, resultN,\\n resultElse)\\n\\n-- We can emulate the conditional operator by just selecting one case:\\nDECODE(expression, condition, true, false)\\n\\nThe function is, today, deprecated in favour of the standard expression. This can be used in both Oracle SQL queries as well as PL/SQL blocks, whereas can only be used in the former.\\n\\nPerl\\nA traditional if-else construct in Perl is written:\\n\\nif ($a > $b) {\\n $result = $x;\\n} else {\\n $result = $y;\\n}\\n\\nRewritten to use the conditional operator:\\n\\n$result = $a > $b ? $x : $y;\\n\\nThe precedence of the conditional operator in perl is the same as in C, not as in C++. This is conveniently of higher precedence than a comma operator but lower than the precedence of most operators used in expressions within the ternary operator, so the use of parentheses is rarely required.\\n\\nIts associativity matches that of C and C++, not that of PHP. Unlike C but like C++, perl allows the use of the conditional expression as an L-value; for example:\\n\\n$a > $b ? $x : $y = $result;\\n\\nwill assign to either or depending on the logical expression\\'s boolean result.\\n\\nThe respective precedence rules and associativities of the operators used guarantee that the version absent any parentheses is equivalent to this explicitly parenthesized version:\\n\\n(($a > $b) ? 
\n\nR\nThe traditional if-else construct in R (which is an implementation of S) is:\n\nif (a < b) {\n x <- \"true\"\n} else {\n x <- \"false\"\n}\n\nIf there is only one statement in each block, braces can be omitted, like in C:\n\nif (a < b)\n x <- \"true\"\nelse\n x <- \"false\"\n\nThe code above can be written in the following condensed way:\n\nx <- if (a < b) \"true\" else \"false\"\n\nThere exists also the function ifelse that allows rewriting the expression above as:\n\nx <- ifelse(a < b, \"true\", \"false\")\n\nThe ifelse function is automatically vectorized. For instance:\n\n> ifelse(c(0, 2) < 1, \"true\", \"false\")\n[1] \"true\" \"false\"\n\nRaku\nRaku uses the doubled symbol ?? instead of a single ? and the doubled symbol !! instead of a single colon:\n\n$result = $a > $b ?? $x !! $y;\n\nRuby\nExample of using this operator in Ruby:\n\n1 == 2 ? \"true value\" : \"false value\"\n\nReturns \"false value\".\n\nA traditional if-else construct in Ruby is written:\n\nif a > b\n result = x\nelse\n result = y\nend\n\nThis could also be written as:\n\nresult = if a > b\n x\nelse\n y\nend\n\nThese can be rewritten as the following statement:\n\nresult = a > b ? x : y\n\nRust\nBeing an expression-oriented programming language, Rust\'s existing if ... else syntax can behave as the traditional ternary operator does. Earlier versions of the language did have the ?: operator, but it was removed due to duplication with if ... else.\n\nNote the lack of semi-colons in the code below compared to a more declarative if ... else block, and the semi-colon at the end of the assignment to y.\n\nlet x = 5;\n\nlet y = if x == 5 {\n 10\n} else {\n 15\n};\n\nThis could also be written as:\n\nlet y = if x == 5 { 10 } else { 15 };\n\nNote that curly braces are mandatory in Rust conditional expressions.\n\nYou could also use a match expression:\n\nlet y = match x {\n 5 => 10,\n _ => 15,\n};\n\nScheme\nSame as in Common Lisp. Every expression has a value. Thus the built-in if can be used:\n\n(let* ((x 5)\n (y (if (= x 5) 10 15)))\n ...)\n\nSmalltalk\nEvery expression (message send) has a value. Thus ifTrue:ifFalse: can be used:\n\n|x y|\n\nx := 5.\ny := (x == 5) ifTrue:[10] ifFalse:[15].\n\nSwift\nThe ternary conditional operator of Swift is written in the usual way of the C tradition, and is used within expressions.\n\nlet result = a > b ? a : b\n\nTcl\nIn Tcl, this operator is available in expr expressions only:\n\nset x 5\nset y [expr {$x == 5 ? 10 : 15}]\n\nOutside of expr, if can be used for a similar purpose, as it also returns a value:\npackage require math\n\nset x 5\nset y [if {$x == 5} {\n ::math::random $x\n} else {\n ::math::fibonacci $x\n}]\n\nTestStand\nIn a National Instruments TestStand expression, if condition is true, the first expression is evaluated and becomes the output of the conditional operation; if false, the second expression is evaluated and becomes the result. Only one of the two expressions is ever evaluated.\n\ncondition ? first_expression : second_expression\n\nFor example:\n\nRunState.Root.Parameters.TestSocket.Index == 3 ? Locals.UUTIndex = 3 : Locals.UUTIndex = 0\n\nThis sets the local variable Locals.UUTIndex to 3 if RunState.Root.Parameters.TestSocket.Index is 3; otherwise it sets it to 0.\n\nSimilar to other languages, first_expression and second_expression do not need to be autonomous expressions, allowing the operator to be used for variable assignment:\n\nLocals.UUTIndex = ( RunState.Root.Parameters.TestSocket.Index == 3 ? 3 : 0 )\n\nVerilog\nVerilog is technically a hardware description language, not a programming language, though the semantics of both are very similar. It uses the ? : syntax for the ternary operator.\n\n// using blocking assignment\nwire out;\nassign out = sel ? a : b;\n\nThis is equivalent to the more verbose Verilog code:\n\n// using blocking assignment\nwire out;\nif (sel === 1) // sel is 1, not 0, x or z\n assign out = a;\nelse if (sel === 0) // sel is 0, x or z (1 checked above)\n assign out = b;\nelse // sel is x or z (0 and 1 checked above)\n assign out = [comment]; // a and b are compared bit by bit, and return for each bit\n // an x if bits are different, and the bit value if the same\n\nVisual Basic\nVisual Basic doesn\'t use ?: per se, but has a very similar implementation of this shorthand in the IIf function. Using the first example provided in this article, it can do:\n\n\' variable = IIf(condition, value_if_true, value_if_false)\nDim opening_time As Integer = IIf((day = SUNDAY), 12, 9)\n\nIn the above example, IIf is a ternary function, but not a ternary operator. As a function, the values of all three portions are evaluated before the function call occurs. This imposed limitations, and in Visual Basic .NET 9.0, released with Visual Studio 2008, an actual conditional operator was introduced, using the If keyword instead of IIf. This allows the following example code to work:\n\nDim name As String = If(person Is Nothing, \"\", person.Name)\n\nUsing IIf, person.Name would be evaluated even if person is null (Nothing), causing an exception. With a true short-circuiting conditional operator, person.Name is not evaluated unless person is not Nothing.\n\nVisual Basic version 9 added the If operator in addition to the previously existing IIf function. As a true operator, it does not have the side effects and potential inefficiencies of the IIf function.\n\nThe syntaxes of the two are similar: If(condition, true_value, false_value) vs IIf(condition, true_value, false_value). As mentioned above, the function call has significant disadvantages: the sub-expressions must all be evaluated, according to Visual Basic\'s evaluation strategy for function calls, and the result will always be of type Variant (VB) or Object (VB.NET). The If operator, however, does not suffer from these problems, as it supports conditional evaluation and determines the type of the expression based on the types of its operands.
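\n\nThis limitation is not specific to Visual Basic: in any eagerly evaluated language, wrapping selection in an ordinary function forces both branches to be evaluated before the choice is made. A minimal Python sketch (the iif helper is hypothetical, for illustration only):\n\ndef iif(condition, true_value, false_value):\n    # an ordinary function: both arguments are evaluated\n    # before this body runs, just like the classic IIf\n    return true_value if condition else false_value\n\nprint(iif(1 < 2, \"yes\", \"no\"))  # fine when both arguments are safe to evaluate\n\nperson = None\n# name = iif(person is None, \"\", person.name)   # would raise AttributeError\nname = \"\" if person is None else person.name    # short-circuits correctly\nprint(repr(name))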
\n\nResult type\nClearly the type of the result of the ?: operator must be, in some sense, the type unification of the types of its second and third operands. In C this is accomplished for numeric types by arithmetic promotion; since C does not have a type hierarchy for pointer types, pointer operands may only be used if they are of the same type (ignoring type qualifiers) or one is a void pointer or NULL. Mixing pointer and integral types, or incompatible pointer types, is otherwise not permitted; thus\n\nnumber = spell_out_numbers ? \"forty-two\" : 42;\n\nwill result in a compile-time error in most compilers.
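\n\nA dynamically typed language, by contrast, needs no such unification: each branch may have an unrelated type, and the expression takes on the type of whichever branch is selected at run time. A short Python illustration (names arbitrary):\n\nspell_out_numbers = True\n# fine in Python: the branches have unrelated types (str vs int)\nnumber = \"forty-two\" if spell_out_numbers else 42\nprint(type(number).__name__)  # str here; int if the flag were False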
\n\n?: in style guidelines\nConditional operators are widely used and can be useful in certain circumstances to avoid the use of an if statement, either because the extra verbiage would be too lengthy or because the syntactic context does not permit a statement. For example:\n\n #define MAX(a, b) (((a)>(b)) ? (a) : (b))\n\nor\n\n for (i = 0; i < MAX_PATTERNS; i++)\n c_patterns[i].ShowWindow(m_data.fOn[i] ? SW_SHOW : SW_HIDE);\n\n(The latter example uses the Microsoft Foundation Classes Framework for Win32.)\n\nInitialization\nAn important use of the conditional operator is in allowing a single initialization statement, rather than multiple initialization statements. In many cases this also allows single assignment and for an identifier to be a constant.\n\nThe simplest benefit is avoiding duplicating the variable name, as in Python:\n\nx = \'foo\' if b else \'bar\'\n\ninstead of:\n\nif b:\n x = \'foo\'\nelse:\n x = \'bar\'\n\nMore importantly, in languages with block scope, such as C++, the blocks of an if/else statement create new scopes, and thus variables must be declared before the if/else statement, as:\n\nstd::string s;\nif (b)\n s = \"foo\";\nelse\n s = \"bar\";\n\nUse of the conditional operator simplifies this:\n\nstd::string s = b ? \"foo\" : \"bar\";\n\nFurthermore, since initialization is now part of the declaration, rather than a separate statement, the identifier can be a constant (formally, of const-qualified type):\n\nconst std::string s = b ? \"foo\" : \"bar\";\n\nCase selectors\nWhen properly formatted, the conditional operator can be used to write simple and coherent case selectors. For example:\n\nvehicle = arg == \'B\' ? bus :\n arg == \'A\' ? airplane :\n arg == \'T\' ? train :\n arg == \'C\' ? car :\n arg == \'H\' ? horse :\n feet;\n\nAppropriate use of the conditional operator in a variable assignment context reduces the probability of a bug from a faulty assignment, as the assigned variable is stated just once rather than multiple times.
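\n\nBecause the operator is right-associative in most languages, such chains nest from the top down. The equivalent chain in Python\'s conditional-expression syntax reads the same way (a sketch; the names are arbitrary):\n\narg = \"A\"\nvehicle = (\"bus\" if arg == \"B\" else\n \"airplane\" if arg == \"A\" else\n \"train\" if arg == \"T\" else\n \"car\" if arg == \"C\" else\n \"horse\" if arg == \"H\" else\n \"feet\")\nprint(vehicle)  # airplane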
\n\nProgramming languages without the conditional operator\nThe following are examples of notable general-purpose programming languages that don\'t provide a conditional operator:\n\n CoffeeScript\n Go programming language\n MATLAB\n Pascal, although Object Pascal / Delphi do have an IfThen function to do the same (with caveats)\n Rust, where the if ... else construct is an expression and can be used to get the same functionality\n PowerShell (in old versions), where an elegant workaround is to index a two-element array with the negated condition, as in ($ifTrue, $ifFalse)[!($condition)]\n\nSee also\n IIf, inline if function\n Null coalescing operator, the ?? operator\n Elvis operator, ?:, used as a shorthand binary operator\n Conditioned disjunction, an equivalent ternary logical connective\n\nExternal links\n Description of If operator in Visual Basic\n Description of Conditional Expression in Python (PEP 308)\n Description in the Java Language Specification\n Description in the PHP Language Documentation'}],\n", +
" 'negative_passages': [{'docid': 'doc-en-9',\n", +
" 'text': 'The Pirates of Penzance; or, The Slave of Duty is a comic opera in two acts, with music by Arthur Sullivan and libretto by W.\xa0S.\xa0Gilbert. The opera\'s official premiere was at the Fifth Avenue Theatre in New York City on 31 December 1879, where the show was well received by both audiences and critics. Its London debut was on 3 April 1880, at the Opera Comique, where it ran for 363 performances.\n\nThe story concerns Frederic, who, having completed his 21st year, is released from his apprenticeship to a band of tender-hearted pirates. He meets the daughters of Major-General Stanley, including Mabel, and the two young people fall instantly in love. Frederic soon learns, however, that he was born on the 29th of February, and so, technically, he has a birthday only once each leap year. His indenture specifies that he remain apprenticed to the pirates until his \"twenty-first birthday\", meaning that he must serve for another 63 years. Bound by his own sense of duty, Frederic\'s only solace is that Mabel agrees to wait for him faithfully.\n\nPirates was the fifth Gilbert and Sullivan collaboration and introduced the much-parodied \"Major-General\'s Song\". The opera was performed for over a century by the D\'Oyly Carte Opera Company in Britain and by many other opera companies and repertory companies worldwide. Modernized productions include Joseph Papp\'s 1981 Broadway production, which ran for 787 performances, winning the Tony Award for Best Revival and the Drama Desk Award for Outstanding Musical, and spawning many imitations and a 1983 film adaptation. Pirates remains popular today, taking its place along with The Mikado and H.M.S. Pinafore as one of the most frequently played Gilbert and Sullivan operas.\n\nBackground\n\nThe Pirates of Penzance was the only Gilbert and Sullivan opera to have its official premiere in the United States. At the time, American law offered no copyright protection to foreigners. After the pair\'s previous opera, H.M.S.
Pinafore, achieved success in London in 1878, approximately 150 American companies quickly mounted unauthorised productions that often took considerable liberties with the text and paid no royalties to the creators. Gilbert and Sullivan hoped to forestall further \"copyright piracy\" by mounting the first production of their next opera in America, before others could copy it, and by delaying publication of the score and libretto. They succeeded in keeping for themselves the direct profits of the first American production of The Pirates of Penzance by opening the production themselves on Broadway, prior to the London production, and they also operated profitable US touring companies of Pirates and Pinafore. However, Gilbert, Sullivan, and their producer, Richard D\\'Oyly Carte, failed in their efforts, over the next decade, to control the American performance copyrights to Pirates and their other operas.\\n\\nFiction and plays about pirates were ubiquitous in the 19th century. Walter Scott\\'s The Pirate (1822) and James Fenimore Cooper\\'s The Red Rover were key sources for the romanticised, dashing pirate image and the idea of repentant pirates. Both Gilbert and Sullivan had parodied these ideas early in their careers. Sullivan had written a comic opera called The Contrabandista, in 1867, about a hapless British tourist who is captured by bandits and forced to become their chief. Gilbert had written several comic works that involved pirates or bandits. In Gilbert\\'s 1876 opera Princess Toto, the title character is eager to be captured by a brigand chief. Gilbert had translated Jacques Offenbach\\'s operetta Les brigands, in 1871. As in Les brigands, The Pirates of Penzance absurdly treats stealing as a professional career path, with apprentices and tools of the trade such as the crowbar and life preserver.\\n\\nGenesis\\nWhile Pinafore was running strongly at the Opera Comique in London, Gilbert was eager to get started on his and Sullivan\\'s next opera, and he began working on the libretto in December 1878. He re-used several elements of his 1870 one-act piece, Our Island Home, which had introduced a pirate \"chief\", Captain Bang. Bang was mistakenly apprenticed to a pirate band as a child by his deaf nursemaid. Also, Bang, like Frederic in The Pirates of Penzance, had never seen a woman before and felt a keen sense of duty, as an apprenticed pirate, until the passage of his twenty-first birthday freed him from his articles of indenture. Bernard Shaw believed that Gilbert drew on ideas in Les brigands for his new libretto, including the businesslike bandits and the bumbling police. Gilbert and Sullivan also inserted into Act II an idea they first considered for a one-act opera parody in 1876 about burglars meeting police, while their conflict escapes the notice of the oblivious father of a large family of girls. As in Pinafore, \"there was a wordful self-descriptive set-piece for Stanley [\"The Major-General\\'s Song\"], introducing himself much as Sir Joseph Porter had done ... a lugubrious comic number for the Sergeant of Police ... a song of confession for Ruth, the successor [to] Little Buttercup\", romantic material for Frederic and Mabel, and \"ensemble and chorus music in turn pretty, parodic and atmospheric.\"\\n\\nGilbert, Sullivan and Carte met by 24 April 1879 to make plans for a production of Pinafore and the new opera in America. Carte travelled to New York in the summer of 1879 and made arrangements with theatre manager John T. 
Ford to present, at the Fifth Avenue Theatre, the authorised productions. He then returned to London. Meanwhile, once Pinafore became a hit in London, the author, composer and producer had the financial resources to produce future shows themselves, and they executed a plan to free themselves from their financial backers in the \"Comedy Opera Company\". Carte formed a new partnership with Gilbert and Sullivan to divide profits equally among themselves after the expenses of each of their shows.\\n\\nIn November 1879, Gilbert, Sullivan and Carte sailed to America with a company of singing actors, to play both Pinafore and the new opera, including J. H. Ryley as Sir Joseph, Blanche Roosevelt as Josephine, Alice Barnett as Little Buttercup, Furneaux Cook as Dick Deadeye, Hugh Talbot as Ralph Rackstraw and Jessie Bond as Cousin Hebe, some of whom had been in the Pinafore cast in London. To these, he added some American singers, including Signor Brocolini as Captain Corcoran. Alfred Cellier came to assist Sullivan, while his brother François Cellier remained in London to conduct Pinafore there. Gilbert and Sullivan cast talented actors who were not well-known stars and did not command high fees. They then tailored their operas to the particular abilities of these performers. The skill with which Gilbert and Sullivan used their performers had an effect on the audience: as critic Herman Klein wrote, \"we secretly marvelled at the naturalness and ease with which [the Gilbertian quips and absurdities] were said and done. For until then no living soul had seen upon the stage such weird, eccentric, yet intensely human beings\\xa0.... [They] conjured into existence a hitherto unknown comic world of sheer delight.\" Gilbert acted as stage director for his own plays and operas. He sought naturalism in acting, which was unusual at the time, just as he strove for realistic visual elements. He deprecated self-conscious interaction with the audience and insisted on a style of portrayal in which the characters were never aware of their own absurdity but were coherent internal wholes. Sullivan conducted the music rehearsals.\\n\\nSullivan had sketched out the music for Pirates in England. When he arrived in New York, however, he found that he had left the sketches for Act I behind, and he had to reconstruct the first act from memory, or compose new numbers. Gilbert told a correspondent many years later that Sullivan was unable to recall his setting of the entrance of the women\\'s chorus, so they substituted the chorus \"Climbing over rocky mountain\" from their earlier opera, Thespis. Sullivan\\'s manuscript for Pirates contains pages removed from a Thespis score, with the vocal parts of this chorus altered from their original arrangement as a four-part chorus. Some scholars (e.g. Tillett and Spencer, 2000) have suggested that Gilbert and Sullivan had planned all along to re-use \"Climbing over rocky mountain,\" and perhaps other parts of Thespis. They argue that Sullivan\\'s having brought the unpublished Thespis score to New York, when there were no plans to revive Thespis, might not have been accidental. In any case, on 10 December 1879, Sullivan wrote a letter to his mother about the new opera, upon which he was hard at work in New York. 
\"I think it will be a great success, for it is exquisitely funny, and the music is strikingly tuneful and catching.\" As was his usual practice in his operas, Sullivan left the overture for the last moment, often sketching it out and entrusting completion of \"the details\" to an assistant, in this case the company\\'s music director, Alfred Cellier.\\n\\nPinafore opened in New York on 1 December 1879 and ran for the rest of December. After a reasonably strong first week, audiences quickly fell off, since most New Yorkers had already seen local productions of Pinafore. In the meantime, Gilbert and Sullivan raced to complete and rehearse The Pirates of Penzance. The work\\'s title is a multi-layered joke. On the one hand, Penzance was a docile seaside resort in 1879, and not the place where one would expect to encounter pirates. On the other hand, the title was also a jab at the theatrical \"pirates\" who had staged unlicensed productions of H.M.S. Pinafore in America. To secure the British copyright, a D\\'Oyly Carte touring company gave a perfunctory copyright performance of Pirates the afternoon before the New York premiere, at the Royal Bijou Theatre in Paignton, Devon, organised by Helen Lenoir, who would later marry Richard D\\'Oyly Carte. The cast, which was performing Pinafore in the evenings in Torquay, received some of the music for Pirates only two days beforehand. Having had only one rehearsal, they travelled to nearby Paignton for the matinee, where they read their parts from scripts carried onto the stage, making do with whatever costumes they had on hand.\\n\\nOriginal production and aftermath\\n\\nPirates opened on 31 December 1879 in New York and was an immediate hit. On 2 January 1880, Sullivan wrote, in another letter to his mother from New York, \"The libretto is ingenious, clever, wonderfully funny in parts, and sometimes brilliant in dialogue – beautifully written for music, as is all Gilbert does. ... The music is infinitely superior in every way to the Pinafore – \\'tunier\\' and more developed, of a higher class altogether. I think that in time it will be very popular.\" Shortly thereafter, Carte sent three touring companies around the United States East Coast and Midwest, playing Pirates and Pinafore. Sullivan\\'s prediction was correct. After a strong run in New York and several American tours, Pirates opened in London on 3 April 1880, running for 363 performances there. It remains one of the most popular G&S works. The London sets were designed by John O\\'Connor.\\n\\nThe critics\\' notices were generally excellent in both New York and London. The character of Major-General Stanley was widely taken to be a caricature of the popular general Sir Garnet Wolseley. The biographer Michael Ainger, however, doubts that Gilbert intended a caricature of Wolseley, identifying instead General Henry Turner, uncle of Gilbert\\'s wife, as the pattern for the \"modern Major-General\". Gilbert disliked Turner, who, unlike the progressive Wolseley, was of the old school of officers. Nevertheless, in the original London production, George Grossmith imitated Wolseley\\'s mannerisms and appearance, particularly his large moustache, and the audience recognised the allusion. 
Wolseley himself, according to his biographer, took no offence at the caricature and sometimes sang \"I am the very model of a modern Major-General\" for the private amusement of his family and friends.\\n\\nRoles\\n Major-General Stanley (comic baritone)\\n The Pirate King (bass-baritone)\\n Samuel, his Lieutenant (baritone)\\n Frederic, the Pirate Apprentice (tenor)\\n Sergeant of Police (bass)\\nGeneral Stanley\\'s daughters\\n Mabel (soprano)\\n Edith (mezzo-soprano)\\n Kate (mezzo-soprano)\\n Isabel (speaking role)\\n Ruth, a Piratical Maid of all work (contralto)\\n Chorus of Pirates, Police and General Stanley\\'s Daughters\\n\\nSynopsis\\n\\nAct I\\nOn the coast of Cornwall, during Queen Victoria\\'s reign, Frederic celebrates the completion of his twenty-first year and the end of his apprenticeship to a gentlemanly band of pirates (\"Pour, oh pour the pirate sherry\"). The pirates\\' maid of all work, Ruth, appears and reveals that, as Frederic\\'s nursemaid long ago, she made a mistake \"through being hard of hearing\": Mishearing Frederic\\'s father\\'s instructions, she apprenticed him to a pirate, instead of to a ship\\'s pilot (\"When Frederic was a little lad\").\\n\\nFrederic has never seen any woman other than Ruth, and he believes her to be beautiful. The pirates know better and suggest that Frederic take Ruth with him when he returns to civilisation. Frederic announces that, although it pains him, so strong is his sense of duty that, once free from his apprenticeship, he will be forced to devote himself to the pirates\\' extermination. He also points out that they are not successful pirates: since they are all orphans, they allow their prey to go free if they too are orphans. Frederic notes that word of this has got about, so captured ships\\' companies routinely claim to be orphans. Frederic invites the pirates to give up piracy and go with him, so that he need not destroy them, but the Pirate King says that, contrasted with respectability, piracy is comparatively honest (\"Oh! better far to live and die\"). The pirates depart, leaving Frederic and Ruth. Frederic sees a group of beautiful young girls approaching the pirate lair, and realises that Ruth misled him about her appearance (\"Oh false one! You have deceived me!\"). Sending Ruth away, Frederic hides before the girls arrive.\\n\\nThe girls burst exuberantly upon the secluded spot (\"Climbing over rocky mountain\"). Frederic reveals himself (\"Stop, ladies, pray!\"), startling them. He appeals to them to help him reform (\"Oh! is there not one maiden breast?\"). The girls are fascinated by him, but all reject him, except one: Mabel responds to his plea, chiding her sisters for their lack of charity (\"Oh sisters deaf to pity\\'s name for shame!\"). She offers Frederic her pity (\"Poor wand\\'ring one\"), and the two quickly fall in love. The other girls discuss whether to eavesdrop or to leave the new couple alone (\"What ought we to do?\"), deciding to \"talk about the weather,\" although they steal glances at the affectionate couple (\"How beautifully blue the sky\").\\n\\nFrederic warns the young ladies that his old associates will soon return (\"Stay, we must not lose our senses\"), but before they can flee, the pirates arrive and capture the girls, intending to marry them (\"Here\\'s a first rate opportunity\"). Mabel warns the pirates that the girls\\' father is a Major-General (\"Hold, monsters!\"), who soon arrives and introduces himself (\"I am the very model of a modern Major-General\"). 
He appeals to the pirates not to take his daughters, leaving him to face his old age alone. Having heard of the famous Pirates of Penzance, he pretends that he is an orphan to elicit their sympathy (\"Oh, men of dark and dismal fate\"). The soft-hearted pirates release the girls (\"Hail, Poetry!\"), making Major-General Stanley and his daughters honorary members of their band (\"Pray observe the magnanimity\").\\n\\nAct II\\nThe Major-General sits in a ruined chapel on his estate, surrounded by his daughters. His conscience is tortured by the lie that he told the pirates, and the girls attempt to console him (\"Oh dry the glist\\'ning tear\"). The Sergeant of Police and his corps arrive to announce their readiness to arrest the pirates (\"When the foeman bares his steel\"). The girls loudly express their admiration of the police for facing likely slaughter by fierce and merciless foes. The police are unnerved by this and leave reluctantly.\\n\\nLeft alone, Frederic, who is to lead the police, reflects on his opportunity to atone for a life of piracy (\"Now for the pirates\\' lair\"), at which point he encounters Ruth and the Pirate King. They have realised that Frederic\\'s apprenticeship was worded so as to bind him to them until his twenty-first birthday – and, because that birthday happens to be on the 29th of February (in a leap year), it means that technically only five birthdays have passed (\"When you had left our pirate fold\"), and he will not reach his twenty-first birthday until he is in his eighties. Frederic is convinced by this logic and agrees to rejoin the pirates. He then sees it as his duty to inform the Pirate King of the Major-General\\'s deception. The outraged outlaw declares that the pirates\\' \"revenge will be swift and terrible\" (\"Away, away, my heart\\'s on fire\").\\n\\nFrederic meets Mabel (\"All is prepared\"), and she pleads with him to stay (\"Stay Frederic, stay\"), but he feels bound by his duty to the pirates until his 21st birthday – in 1940. They agree to be faithful to each other until then, though to Mabel \"It seems so long\" (\"Oh, here is love, and here is truth\"); Frederic departs. Mabel steels herself (\"No, I\\'ll be brave\") and tells the police that they must go alone to face the pirates. They muse that an outlaw might be just like any other man, and it is a shame to deprive him of \"that liberty which is so dear to all\" (\"When a felon\\'s not engaged in his employment\"). The police hide on hearing the approach of the pirates (\"A rollicking band of pirates we\"), who have stolen onto the estate, intending to take revenge for the Major-General\\'s lie (\"With cat-like tread\").\\n\\nJust then, Major-General Stanley appears, sleepless with guilt, and the pirates also hide (\"Hush, hush! not a word\"), while the Major-General listens to the soothing breeze (\"Sighing softly to the river\"). The girls come looking for him. The pirates leap out to seize them, and the police rush to their defense; but the police are easily defeated, and the Pirate King urges the captured Major-General to prepare for death. The Sergeant has one stratagem left: he demands that the pirates yield \"in Queen Victoria\\'s name\"; the pirates, overcome with loyalty to their Queen, do so. Ruth appears and reveals that the pirates are \"all noblemen who have gone wrong\". The Major-General is impressed by this and all is forgiven. 
Frederic and Mabel are reunited, and the Major-General is happy to marry his daughters to the noble ex-pirates after all (\"Poor Wand\\'ring Ones\" (reprise)).\\n\\nMusical numbers\\n Overture (includes \"With cat-like tread\", \"Ah, leave me not to pine\", \"Pray observe the magnanimity\", \"When you had left our pirate fold\", \"Climbing over rocky mountain\", and \"How beautifully blue the sky\")\\n\\nAct I\\n\\n 1. \"Pour, oh pour, the pirate sherry\" (Samuel and Chorus of Pirates)\\n 2. \"When Fred\\'ric was a little lad\" (Ruth)\\n 3. \"Oh, better far to live and die\" (Pirate King and Chorus of Pirates)\\n 4. \"Oh! false one, you have deceiv\\'d me\" (Frederic and Ruth)\\n 5. \"Climbing over rocky mountain\" (Chorus of Girls)\\n 6. \"Stop, ladies, pray\" (Edith, Kate, Frederic, and Chorus of Girls)\\n 7. \"Oh, is there not one maiden breast?\" (Frederic and Chorus of Girls)\\n 8. \"Poor wand\\'ring one\" (Mabel and Chorus of Girls)\\n 9. \"What ought we to do?\" (Edith, Kate, and Chorus of Girls)\\n 10. \"How beautifully blue the sky\" (Mabel, Frederic, and Chorus of Girls)\\n 11. \"Stay, we must not lose our senses\" ... \"Here\\'s a first-rate opportunity to get married with impunity\" (Frederic and Chorus of Girls and Pirates)\\n 12. \"Hold, monsters\" (Mabel, Major-General, Samuel, and Chorus)\\n 13. \"I am the very model of a modern Major-General\" (Major-General and Chorus)\\n 14. Finale Act I (Mabel, Kate, Edith, Ruth, Frederic, Samuel, King, Major-General, and Chorus)\\n \"Oh, men of dark and dismal fate\"\\n \"I’m telling a terrible story\"\\n \"Hail, Poetry\"\\n \"Oh, happy day, with joyous glee\"\\n \"Pray observe the magnanimity\" (reprise of \"Here\\'s a first-rate opportunity\")\\n\\nAct II\\n 15. \"Oh, dry the glist\\'ning tear\" (Mabel and Chorus of Girls)\\n 16. \"Then, Frederic, let your escort lion-hearted\" (Frederic and Major-General)\\n 17. \"When the foeman bares his steel\" (Mabel, Edith, Sergeant, and Chorus of Policemen and Girls)\\n 18. \"Now for the pirates\\' lair!\" (Frederic, Ruth, and King)\\n 19. \"When you had left our pirate fold\" [The \"paradox\" trio] (Ruth, Frederic, and King)\\n 20. \"Away, away! My heart\\'s on fire!\" (Ruth, Frederic, and King)\\n 21. \"All is prepar\\'d; your gallant crew await you\" (Mabel and Frederic)\\n 22. \"Stay, Fred\\'ric, stay\" ... \"Ah, leave me not to pine\" ... \"Oh, here is love, and here is truth\" (Mabel and Frederic)\\n 23. \"No, I\\'ll be brave\" ... \"Though in body and in mind\" (Reprise of \"When the foeman bares his steel\") (Mabel, Sergeant, and Chorus of Police)\\n 23a. \"Sergeant, approach!\" (Mabel, Sergeant of Police, and Chorus of Police)\\n 24. \"When a felon\\'s not engaged in his employment\" (Sergeant and Chorus of Police)\\n 25. \"A rollicking band of pirates we\" (Sergeant and Chorus of Pirates and Police)\\n 26. \"With cat-like tread, upon our prey we steal\" (Samuel and Chorus of Pirates and Police)\\n 27. \"Hush, hush, not a word!\" (Frederic, King, Major-General, and Chorus of Police and Pirates)\\n 28. Finale, Act II (Ensemble)\\n \"Sighing softly to the river\"\\n \"Now what is this, and what is that?\"\\n \"You/We triumph now\"\\n \"Away with them, and place them at the bar!\"\\n \"Poor wandering ones!\"\\n\\nCritical reception\\nThe notices from critics were generally excellent in both New York and London in 1880. In New York, the Herald and the Tribune both dedicated considerable space to their reviews. 
The Herald took the view that \"the new work is in every respect superior to the Pinafore, the text more humorous, the music more elegant and more elaborate.\" The Tribune called it \"a brilliant and complete success\", commenting, \"The humor of the Pirates is richer, but more recondite. It demands a closer attention to the words [but] there are great stores of wit and drollery ... which will well repay exploration. ... The music is fresh, bright, elegant and merry, and much of it belongs to a higher order of art than the most popular of the tunes of Pinafore.\" The New York Times also praised the work, writing, \"it would be impossible for a confirmed misanthrope to refrain from merriment over it\", though the paper doubted if Pirates could repeat the prodigious success of Pinafore.\\n\\nAfter the London premiere, the critical consensus, led by the theatrical newspaper The Era, was that the new work marked a distinct advance on Gilbert and Sullivan\\'s earlier works. The Pall Mall Gazette said, \"Of Mr. Sullivan\\'s music we must speak in detail on some other occasion. Suffice it for the present to say that in the new style which he has marked out for himself it is the best he has written.\" The Graphic wrote:\\n\\nThere were a few dissenting comments: The Manchester Guardian thought both author and composer had drawn on the works of their predecessors: \"Mr. Gilbert ... seems to have borrowed an idea from Sheridan\\'s The Critic; Mr. Sullivan\\'s music is sprightly, tuneful and full of \\'go\\', although it is certainly lacking in originality.\" The Sporting Times noted, \"It doesn\\'t appear to have struck any of the critics yet that the central idea in The Pirates of Penzance is taken from Our Island Home, which was played by the German Reeds some ten years ago.\" The Times thought Gilbert\\'s wit outran his dramatic invention, and Sullivan\\'s music for the new work was not quite as good as his score for The Sorcerer, which the Times critic called a masterpiece.\\n\\nMusical analysis\\nThe overture to The Pirates of Penzance was composed by Sullivan and his musical assistant Alfred Cellier. It follows the pattern of most Savoy opera overtures: a lively opening (the melody of \"With cat-like tread\"), a slow middle section (\"Ah, leave me not to pine alone\"), and a concluding allegro in a compressed sonata form, in which the themes of \"How beautifully blue the sky\" and \"A paradox, a paradox\" are combined.\\n\\nParody\\nThe score parodies several composers, most conspicuously Verdi. \"Come, friends, who plough the sea\" and \"You triumph now\" are burlesques of Il trovatore, and one of the best-known choral passages from the finale to Act\\xa0I, \"Hail Poetry\", is, according to the Sullivan scholar, Arthur Jacobs, a burlesque of the prayer scene, \"La Vergine degli Angeli\", in Verdi\\'s La forza del destino. However, another musicologist, Nicholas Temperley, writes, \"The choral outburst \\'Hail, Poetry\\' in The Pirates of Penzance would need very little alteration to turn it into a Mozart string quartet.\" Another well-known parody number from the work is the song for coloratura, \"Poor wand\\'ring one\", which is generally thought to burlesque Gounod\\'s waltz-songs, though the music critic of The Times called it \"mock-Donizetti\". In a scene in Act\\xa0II, Mabel addresses the police, who chant their response in the manner of an Anglican church service.\\n\\nSullivan even managed to parody two composers at once. 
The critic Rodney Milnes describes the Major-General\\'s Act\\xa0II song, \"Sighing softly to the river\", \"as plainly inspired by – and indeed worthy of – Sullivan\\'s hero Schubert\", and Amanda Holden speaks of the song\\'s \"Schubertian water-rippling accompaniment\", but adds that it simultaneously spoofs Verdi\\'s Il trovatore, with the soloist unaware of a concealed male chorus singing behind him.\\n\\nPatter, counterpoint, and vocal writing\\n\\nWriting about patter songs, Shaw, in his capacity as a music critic, praised \"the time-honored lilt which Sir Arthur Sullivan, following the example of Mozart and Rossini, chose for the lists of accomplishments of the Major-General in The Pirates or the Colonel in Patience.\"\\n\\nThis opera contains two well-known examples of Sullivan\\'s characteristic combination of two seemingly disparate melodies. Jacobs suggests that Berlioz\\'s La damnation de Faust, a great favourite in Sullivan\\'s formative years, may have been the model for Sullivan\\'s trademark contrapuntal mingling of the rapid prattle of the women\\'s chorus in Act I (\"How beautifully blue the sky\") in 2/4 time with the lovers\\' duet in waltz time. Jacobs writes that \"the whole number [shifts] with Schubertian ease from B to G and back again.\" In Act II, a double chorus combines the policemen\\'s dogged tune, \"When the foeman bares his steel\" and the soaring line for the women, \"Go, ye heroes, go to glory\". In adapting the four-part chorus \"Climbing over rocky mountain\" from Thespis for re-use in Pirates, Sullivan took less trouble: he wrote only a single vocal line, suitable for soprano voices. Despite this, the number ends with another example of Sullivan\\'s counterpoint, with the chorus singing the second melody of the piece (\"Let us gaily tread the measure\") while the orchestra plays the first (\"Climbing over rocky mountain\").\\n\\nSullivan set a particular vocal challenge for the soprano who portrays Mabel. The Sullivan scholar Gervase Hughes wrote, \"Mabel ... must be a coloratura because of \\'Poor wand\\'ring one!\\', yet \\'Dear father, why leave your bed\\' demands steady beauty of tone throughout the octave F to F, and \\'Ah, leave me not to pine\\' goes a third lower still.\" In The Music of Arthur Sullivan (1959), Hughes quoted four extracts from Pirates, saying that if hearing each out of context one might attribute it to Schubert, Mendelssohn, Gounod or Bizet respectively, \"yet on learning the truth one would kick oneself for not having recognised Sullivan\\'s touch in all four.\" Hughes concluded by quoting the introductory bars of \"When a felon\\'s not engaged in his employment\", adding, \"There could never be any doubt as to who wrote that, and it is as English as our wonderful police themselves.\"\\n\\nVersions\\n\\nBecause the work was premiered in three different places (the Paignton performance and the full productions in New York and London), there are more variations in the early libretto and score of The Pirates of Penzance than in other Gilbert and Sullivan works. Songs sent from New York to the D\\'Oyly Carte touring company in England for the Paignton premiere were then altered or omitted during Broadway rehearsals. Gilbert and Sullivan trimmed the work for the London premiere, and Gilbert made further alterations up to and including the 1908 Savoy revival. For example, early versions depicted the Pirate King as the servant of the pirate band, and the words of the opening chorus were, \"Pour, O King, the pirate sherry\". 
In the original New York production the revelation by Ruth that the pirates are \"all noblemen who have gone wrong\" prompted the following exchange (recalling a famous passage in H.M.S. Pinafore):\\n\\nIn the original London production, this exchange was shortened to the following:\\n\\nGilbert deleted the exchange in the 1900 revival, and the Chappell vocal score was revised accordingly. For the 1908 revival Gilbert had the pirates yielding \"in good King Edward\\'s name\". Despite Helen Carte\\'s repeated urging, Gilbert did not prepare an authorised version of the libretti of the Savoy operas.\\n\\nIn its 1989 production, the D\\'Oyly Carte Opera Company restored one of the original versions of the finale, which finishes with a variation of \"I am the very model of a modern major-general\", rather than with the customary reprise of \"Poor wand\\'ring one\", but in later revivals, it reverted to the more familiar text.\\n\\nProduction history\\n\\nThe Pirates of Penzance has been one of Gilbert and Sullivan\\'s most popular comic operas. After its unique triple opening in 1879–80, it was revived in London at the Savoy Theatre in 1888 and in 1900, and for the Savoy\\'s repertory season of 1908–09. In the British provinces, the D\\'Oyly Carte Opera Company toured it almost continuously from 1880–1884, and again in 1888. It re-entered the D\\'Oyly Carte touring repertory in 1893 and was never again absent until the company\\'s closure in 1982. New costumes were designed by Percy Anderson in 1919 and George Sheringham in 1929 (who also executed a new Act I set). Peter Goffin created a new touring set in 1957.\\n\\nIn America, after the New York opening on New Year\\'s Eve, 1879, Richard D\\'Oyly Carte launched four companies that covered the United States on tours that lasted through the following summer. Gilbert and Sullivan themselves trained each of the touring companies through January and early February 1880, and each company\\'s first performance – whether it was in Philadelphia, Newark, or Buffalo – was conducted by the composer. In Australia, its first authorised performance was on 19 March 1881 at the Theatre Royal, Sydney, produced by J. C. Williamson. There was still no international copyright law in 1880, and the first unauthorised New York production was given by the Boston Ideal Opera Company at Booth\\'s Theatre in September of that year. The opera premiered in a German translation by Richard Genée and Camillo Walzel (Die Piraten) in Austria at the Theater an der Wien on 1 March 1889, and in Düsseldorf, Germany, on 1 December 1936.\\n\\nThe first non-D\\'Oyly Carte professional production in a country that had been subject to Gilbert\\'s copyright (other than Williamsons\\' authorised productions) was in Stratford, Ontario, Canada, in September 1961, as the copyright expired. In 1979, the Torbay branch of the Gilbert and Sullivan Society presented a centenary tribute to the world premiere performance of Pirates in Paignton, with a production at the Palace Avenue Theatre (situated a few metres from the former Bijou Theatre).\\n\\nNew York has seen over forty major revivals since the premiere. One of these, produced and directed by Winthrop Ames in 1926 at the Plymouth Theatre, ran for 128 performances and gained good notices. A brief 1952 Broadway staging starring Martyn Green, earned Lehman Engel a Tony Award as conductor. 
Repertory companies that have mounted Pirates numerous times Off-Broadway and on tour in the US have included the American Savoyards (1953–67), the Light Opera of Manhattan (1968–89) and the New York Gilbert and Sullivan Players (1976–present).\\n\\nAs discussed below, Joseph Papp\\'s 1980–83 Pirates ran for nearly two years each on Broadway and in the West End, boosting the opera\\'s popularity. Professional and amateur productions of the opera continue with frequency. For example, the Chicago Lyric Opera and English National Opera each staged the work in 2004, and in 2007, the New York City Opera and Opera Australia both mounted new productions. In 2013, Scottish Opera produced a British touring production of The Pirates of Penzance co-produced by the trustees of the D\\'Oyly Carte Opera Company. Richard Suart played Major-General Stanley and Nicholas Sharratt played Frederic.\\n\\nThe following table shows the history of the D\\'Oyly Carte productions in Gilbert\\'s lifetime (excluding tours):\\n\\nHistorical casting\\nThe following tables show the casts of the principal original productions and D\\'Oyly Carte Opera Company touring repertory at various times through to the company\\'s 1982 closure:\\n\\nJoseph Papp\\'s Pirates\\n\\nIn 1980, Joseph Papp and the Public Theater of New York City produced a new version of Pirates, directed by Wilford Leach and choreographed by Graciela Daniele, at the Delacorte Theatre in Central Park, as a Shakespeare in the Park summer event. Musical direction and arrangements were by William Elliott. The show played for 10 previews and 35 performances. It then transferred to Broadway, opening on 8 January 1981 for a run of 20 previews and 787 regular performances at the Uris and Minskoff Theatres, the longest run for any Gilbert and Sullivan production in history. This take on Pirates earned enthusiastic reviews and seven Tony Award nominations, winning three, including the award for Best Revival and for Leach as director. It was also nominated for eight Drama Desk Awards, winning five, including Outstanding Musical and director.\\n\\nCompared with traditional productions of the opera, Papp\\'s Pirates featured a more swashbuckling Pirate King and Frederic, and a broader, more musical comedy style of singing and humour. It did not significantly change the libretto, but it used a new orchestration and arrangements that changed some keys, added repeats, lengthened dance music and made other minor changes in the score. The \"Matter Patter\" trio from Ruddigore and \"Sorry her lot\" from H.M.S. Pinafore, two other Gilbert and Sullivan operas, were interpolated into the show. The production also restored Gilbert and Sullivan\\'s original New York ending, with a reprise of the Major-General\\'s song in the Act II finale. Linda Ronstadt starred as Mabel, Rex Smith as Frederic, Kevin Kline as the Pirate King, Patricia Routledge as Ruth (replaced by Estelle Parsons for the Broadway transfer), George Rose as the Major-General, and Tony Azito as the Sergeant of Police. Kline won a Tony Award for his performance. Smith won a Theatre World Award, and Kline and Azito won Drama Desk Awards. Notable replacements during the Broadway run included Karla DeVito, Maureen McGovern and Pam Dawber as Mabel; Robby Benson, Patrick Cassidy and Peter Noone as Frederic; Treat Williams, Gary Sandy, James Belushi and Wally Kurth as the Pirate King; David Garrison as the Sergeant; George S. Irving as the Major-General; and Kaye Ballard as Ruth. 
The Los Angeles cast of the production featured Barry Bostwick as the Pirate King, Jo Anne Worley as Ruth, Clive Revill as the Major-General, Dawber as Mabel, Paxton Whitehead as the Sergeant, Caroline Peyton as Edith and Andy Gibb as Frederic.\\n\\nThe production opened at the Theatre Royal, Drury Lane, London, on 26 May 1982, to generally warm reviews, for a run of 601 performances, earning an Olivier Award nomination as Outstanding Musical and another for Curry. Notable among the cast were George Cole and Ronald Fraser as the Major-General; Pamela Stephenson as Mabel; Michael Praed and Peter Noone as Frederic; Tim Curry, Timothy Bentinck, Oliver Tobias and Paul Nicholas as the Pirate King; Chris Langham as the Sergeant of Police; Annie Ross as Ruth; Bonnie Langford as Kate; and Louise Gold as Isabel. The Australian production opened in Melbourne in January 1984, opening the new Victorian Arts Centre, directed by John Feraro. It starred Jon English as the Pirate King, Simon Gallaher as Frederic, June Bronhill as Ruth, David Atkins as the Sergeant of Police and Marina Prior as Mabel. The six-week limited season was followed by an Australian national tour from 1984 to 1986 and another tour with same cast in the mid-1990s. In 1985, Papp\\'s Pirates opened the new Queensland Performing Arts Centre in Brisbane, setting attendance records that were not surpassed until many years later by The Phantom of the Opera. Gallaher\\'s Essgee Entertainment version of Pirates was inspired by the Papp version. The Papp version also inspired foreign-language productions in Germany and elsewhere in Europe.\\n\\nThe Papp production was turned into a film in 1983, with the original Broadway principal cast reprising their roles, except that Angela Lansbury replaced Estelle Parsons as Ruth. The minor roles used British actors miming to their Broadway counterparts. The film has been shown occasionally on television. Another film based loosely on the opera and inspired by the success of the Papp version, The Pirate Movie, was released during the Broadway run.\\n\\nThe Papp production design has been widely imitated in later productions of Pirates, even where traditional orchestration and the standard score are used. Some modern productions are also influenced by the Disney film franchise Pirates of the Caribbean, combining aspects of the Papp production with the Disney design concepts. Not all of these revivals have generated the same enthusiasm as Papp\\'s 1980s productions. A 1999 UK touring production received this critique: \"No doubt when Papp first staged this show in New York and London it had some quality of cheek or chutzpah or pizzazz or irony or something that accounted for its success. But all that\\'s left now ... is a crass Broadway-style musical arrangement ground out by a seven-piece band, and the worst kind of smutty send-up of a historic piece of art.\"\\n\\nRecordings\\nThe Pirates of Penzance has been recorded many times, and the critical consensus is that it has fared well on record. The first complete recording of the score was in 1921, under the direction of Rupert D\\'Oyly Carte, but with established recording singers rather than D\\'Oyly Carte Opera Company performers. In 1929, The Gramophone said of a new set with a mainly D\\'Oyly Carte cast, \"This new recording represents the high-water mark so far as Gilbert and Sullivan opera is concerned. 
In each of the previous Savoy albums there have been occasional lapses which prevented one from awarding them unqualified praise; but with the Pirates it is happily otherwise; from first to last, and in every bar, a simply delightful production.\" Of later recordings by the D\\'Oyly Carte Opera Company, the 1968 recording (with complete dialogue) is highly regarded: The online Gilbert and Sullivan Discography says, \"This recording is one of the best D\\'Oyly Carte sets of all time, and certainly the best Pirates\", and the Penguin Guide to Opera on Compact Disc also recommends it. So too does the Penguin Guide to Recorded Classical Music, alongside the 1993 Mackerras recording. The opera critic Alan Blyth recommended the D\\'Oyly Carte recording of 1990: \"a performance full of the kind of life that can only come from the experience of stage performances\". The online Discography site also mentions the 1981 Papp recording as \"excellent\", despite its inauthentic 1980 re-orchestrations that \"changed some of the timbres so as to appeal to a rock-oriented public\".\\n\\nOf the available commercial videos, the Discography site considers the Brent Walker better than the Papp version. More recent professional productions have been recorded on video by the International Gilbert and Sullivan Festival.\\n\\nSelected recordings\\n 1929 D\\'Oyly Carte – Conductor: Malcolm Sargent\\n 1957 D\\'Oyly Carte – New Symphony Orchestra of London; Conductor: Isidore Godfrey\\n 1961 Sargent/Glyndebourne – Pro Arte Orchestra, Glyndebourne Festival Chorus; Conductor: Sir Malcolm Sargent\\n 1968 D\\'Oyly Carte (with dialogue) – Royal Philharmonic Orchestra; Conductor: Isidore Godfrey\\n 1981; 1983 Papp\\'s Pirates (with dialogue) – Director: Wilford Leach; Musical Director: William Elliott; Choreographer: Graciela Daniele\\n 1982 Brent Walker Productions (with dialogue) – Ambrosian Opera Chorus, London Symphony Orchestra; Conductor: Alexander Faris; Stage Director: Michael Geliot\\n 1990 New D\\'Oyly Carte – Conductor: John Pryce-Jones\\n 1993 Mackerras/Telarc – Orchestra and Chorus of the Welsh National Opera; Conductor: Sir Charles Mackerras\\n 1994 Essgee Entertainment (video adaptation) – Director and Choreographer: Craig Schaefer; Orchestrator and Conductor: Kevin Hocking; Additional Lyrics: Melvyn Morrow\\n\\nCultural impact\\n\\nMajor-General\\'s Song\\n\\nPirates is one of the most frequently referenced works of Gilbert and Sullivan. The Major-General\\'s Song, in particular, is frequently parodied, pastiched and used in advertising. Parody versions have been used in political commentary as well as entertainment media. Its challenging patter has proved interesting to comedians; notable examples include Tom Lehrer\\'s song \"The Elements\" and David Hyde Pierce\\'s monologue, as host of Saturday Night Live. In 2010, comedian Ron Butler released a YouTube pastiche of the song in character as President Obama which, as of September 2021, had garnered more than 1.9 million views.\\n\\nPastiche examples include the Animaniacs version, \"I am the very model of a cartoon individual\", in the episode \"H.M.S. 
Yakko\"; the Doctor Who audio \"I am the very model of a Gallifreyan buccaneer\" in Doctor Who and the Pirates; the Studio 60 on the Sunset Strip version in the episode \"The Cold Open\" (2006), where the cast performs \"We\\'ll be the very model of a modern network TV show\"; and the Mass Effect 2 video game version, where the character Mordin Solus sings: \"I am the very model of a scientist Salarian\".\\n\\nThe song is often used in film and on television, unchanged in many instances, as a character\\'s audition piece, or seen in a \"school play\" scene. Examples include a VeggieTales episode entitled \"The Wonderful World of Auto-Tainment!\"; the Frasier episode \"Fathers and Sons\"; The Simpsons episode \"Deep Space Homer\"; and the Mad About You episode \"Moody Blues\", where Paul directs a charity production of Penzance starring his father, Burt, as the Major-General. In The Muppet Show (season 3, episode 4) guest host, comedian Gilda Radner, sings the song with a talking carrot (Parodying the pilot/pirate confusion in Pirates, Radner had requested a talking parrot, but was misheard). In an episode of Home Improvement, Al Borland begins to sing the song when tricked into thinking he is in a soundproof booth. In the Babylon 5 episode \"Atonement\", Marcus Cole uses the song to drive Dr Stephen Franklin crazy on a long journey to Mars.\\n\\nExamples of the use of the song in advertising include Martyn Green\\'s pastiche of the song listing all of the varieties of Campbell\\'s Soup and a 2011 Geico commercial in which a couple that wants to save money, but still listen to musicals, finds a roommate, dressed as the Major-General, who awkwardly begins the song while dancing on a coffee table. Gimbels department store had a campaign sung to the tune of the Major-General\\'s Song that began, \"We are the very model of a modern big department store.\" George Washington, in the number \"Right Hand Man\" from the 2015 musical Hamilton by Lin-Manuel Miranda, refers to himself with irony as \"The model of a modern major general\", which he rhymes with \"men are all\" and \"pedestal\". Miranda commented: \"I always felt like ‘mineral’ wasn\\'t the best possible rhyme.\"\\n\\nFilm and television\\nOther film references to Pirates include Kate & Leopold, where there are multiple references, including a scene where Leopold sings \"I Am The Very Model of A Modern Major-General\" while accompanying himself on the piano; and in Pretty Woman, Edward Lewis (Richard Gere) covers a social gaffe by prostitute Vivian Ward (Julia Roberts), who comments that the opera La traviata was so good that she almost \"peed [her] pants\", by saying that she had said that she liked it better than The Pirates of Penzance\". In Walt Disney\\'s cartoon Mickey, Donald, Goofy: The Three Musketeers (2004), there is a performance of Pirates that becomes the setting for the climactic battle between the Musketeers and Captain Pete. Pirates songs sung in the cartoon are \"With cat-like tread\", \"Poor wand\\'ring one\", \"Climbing over rocky mountain\" and the Major-General\\'s song. \"Poor wand\\'ring one\" was used in the movie An American Tail. 
The soundtrack of the 1992 film The Hand That Rocks the Cradle includes \"Poor Wand\\'ring One\" and \"Oh Dry the Glistening Tear\".\\n\\nTelevision references, in addition to those mentioned above, included the series The West Wing, where Pirates and other Gilbert and Sullivan operas are mentioned in several episodes, especially by Deputy Communications Director, Sam Seaborn, who was recording secretary of his school\\'s Gilbert and Sullivan society. In Studio 60 on the Sunset Strip, a poster from Pirates hangs on Matt Albie\\'s office wall. Both TV series were created by Aaron Sorkin. In the pilot episode of the 2008 CTV series Flashpoint, a police officer and his partner sing the policeman\\'s song. In an Assy McGee episode entitled \"Pegfinger\", Detective Sanchez\\'s wife is a member of a community theatre that performs the opera. In a 1986 episode of the animated television adaptation of The Wind in the Willows entitled A Producer\\'s Lot, several characters put on a production of Pirates. In a 2005 Family Guy episode \"Peter\\'s Got Woods\", Brian Griffin sings \"Sighing Softly\", with Peter Griffin\\'s assistance. In a 2012 episode, \"Killer Queen\", Peter gives a garbled rendition of the Major-General\\'s Song. In the 2009 Criminal Minds episode \"The Slave of Duty\", Hotch quotes \"Oh dry the glist\\'ning tear\". In the 1992 episode \"The Understudy\" of Clarissa Explains it All, the title character is chosen to understudy Mabel in a school production of Pirates and is unprepared when she must go on; a scene from The Mikado is also heard.\\n\\nOther references\\n\\nOther notable instances of references to Pirates include a New York Times article on 29 February 1940, memorialising that Frederic was finally out of his indentures. Six years previously, the arms granted to the municipal borough of Penzance in 1934 contain a pirate dressed in Gilbert\\'s original costuming, and Penzance had a rugby team called the Penzance Pirates, which is now called the Cornish Pirates. In 1980, Isaac Asimov wrote a short story called \"The Year of the Action\", concerning whether the action of Pirates took place on 1 March 1873, or 1 March 1877 (depending on whether Gilbert took into account the fact that 1900 was not a leap year). The plot of Laurie R. King\\'s 2011 novel Pirate King centers on a 1924 silent movie adaptation of The Pirates of Penzance.\\n\\nThe music from the chorus of \"With cat-like tread\", which begins \"Come, friends, who plough the sea,\" was used in the popular American song, \"Hail, Hail, the Gang\\'s All Here.\" \"With cat-like tread\" is also part of the soundtrack, along with other Gilbert and Sullivan songs, in the 1981 film, Chariots of Fire, and it was pastiched in the \"HMS Yakko\" episode of Animaniacs in a song about surfing a whale.\\n\\nAdaptations\\nStage\\n Di Yam Gazlonim, a Yiddish adaptation of Pirates by Al Grand that continues to be performed in North America. The 2006 production at the National Yiddish Theater Folksbiene was nominated for the 2007 Drama Desk Award for Outstanding Revival. The Montreal Express wrote in 2009, \"Grand\\'s adaptation is a delightfully whimsical treatment\".\\n The Parson\\'s Pirates by Opera della Luna premiered in 1995.\\n Pirates! Or, Gilbert and Sullivan Plunder\\'d (2006), is a musical comedy set on a Caribbean island, involving a voodoo curse that makes the pirates \"landsick\". 
It was first presented 1 November 2006 at Goodspeed Opera House in East Haddam, Connecticut, then in 2007 at the Paper Mill Playhouse in Millburn, New Jersey, in 2009 at the Huntington Theatre Company in Boston, Massachusetts, and at The Muny in St Louis, Missouri in 2012. Other Gilbert and Sullivan numbers, such as the Nightmare song from Iolanthe are interpolated.\\n Pirates of Penzance – The Ballet! premiered in 1991\\n Essgee Entertainment produced an adapted version in 1994 in Australia and New Zealand. Their producer, Simon Gallaher (Frederic in the Australian Papp production), produced another adaptation of Pirates that toured Australia from 2001 to 2003\\n All-male versions of the opera include a long-running adaptation by Sasha Regan at the Union Theatre in 2009, which transferred to Wilton\\'s Music Hall in London in 2010 and toured in Australia in 2012.\\n\\nFilm and TV\\n The Pirate Movie, a 1982 musical romantic comedy film loosely based on the opera.\\n The Pirates of Penzance, a 1983 film adaptation of Papp\\'s Broadway production.\\n Die Piraten, a German-language version, was premiered on German television in 1968 and starred Arleen Auger as Mabel, Gerd Nienstedt as the Pirate King and Martha Mödl as Ruth, with Franz Marszalek conducting. Mabel falls in love with the Pirate King, among other plot changes. A 2-CD set of the broadcast was issued by Gala Records in 2000.\\n Several other television adaptations of the opera have been made, beginning in 1939.\\n\\nSee also\\n Our Island Home, one of the sources of the libretto for Pirates\\n\\nReferences\\n\\nSources\\n \\n \\n \\n (Chapters 5 and 6)\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n Also, five supplements, privately printed\\n\\nExternal links\\n\\nGeneral\\n The Pirates of Penzance at The Gilbert & Sullivan Archive\\n Sullivan\\'s autograph manuscript, 1879\\n 1880 London theatre programme\\n Review of the opening night by Clement Scott\\n Papp\\'s version of The Pirates of Penzance at the Music Theatre International website\\n D\\'Oyly Carte Prompt Books at The Victoria and Albert Museum\\n Televised scenes from Pirates, D\\'Oyly Carte Opera Company, 1955\\n \\n\\nLists of productions\\n The Pirates of Penzance. Production list at Floormic.com\\n The Pirates of Penzance at The Internet Broadway Database\\n The Pirates of Penzance at IMDb\\n\\n1879 operas\\nCornwall in fiction\\nDrama Desk Award-winning musicals\\nEnglish comic operas\\nEnglish-language operas\\nNautical fiction\\nOperas adapted into films\\nOperas by Gilbert and Sullivan\\nOperas set in England\\nOperas\\nPenzance\\nPiracy in fiction\\nTony Award-winning musicals'},\n", + " {'docid': 'doc-en-10',\n", + " 'text': 'Follies is a musical with music and lyrics by Stephen Sondheim and a book by James Goldman.\\n\\nThe story concerns a reunion in a crumbling Broadway theater, scheduled for demolition, of the past performers of the \"Weismann\\'s Follies\", a musical revue (based on the Ziegfeld Follies), that played in that theater between the world wars. It focuses on two couples, Buddy and Sally Durant Plummer and Benjamin and Phyllis Rogers Stone, who are attending the reunion. Sally and Phyllis were showgirls in the Follies. Both couples are deeply unhappy with their marriages. Buddy, a traveling salesman, is having an affair with a girl on the road; Sally is still as much in love with Ben as she was years ago; and Ben is so self-absorbed that Phyllis feels emotionally abandoned. 
Several of the former showgirls perform their old numbers, sometimes accompanied by the ghosts of their former selves. The musical numbers in the show have been interpreted as pastiches of the styles of the leading Broadway composers of the 1920s and 1930s, and sometimes as parodies of specific songs.\\n\\nThe Broadway production opened on April 4, 1971, directed by Harold Prince and Michael Bennett, and with choreography by Bennett. The musical was nominated for 11 Tony Awards and won seven. The original production, the second-most costly performed on Broadway to that date, ran for over 500 performances but ultimately lost its entire investment. The musical has had a number of major revivals, and several of its songs have become standards, including \"Broadway Baby\", \"I\\'m Still Here\", \"Too Many Mornings\", \"Could I Leave You?\", and \"Losing My Mind\".\\n\\nBackground\\nAfter the failure of Do I Hear a Waltz? (1965), for which he had written the lyrics to Richard Rodgers\\'s music, Sondheim decided that he would henceforth work only on projects where he could write both the music and lyrics himself. He asked author and playwright James Goldman to join him as bookwriter for a new musical. Inspired by a New York Times article about a gathering of former showgirls from the Ziegfeld Follies, they decided upon a story about ex-showgirls.\\n\\nOriginally titled The Girls Upstairs, the musical was to be produced by David Merrick and Leland Hayward in late 1967, but the plans ultimately fell through, and Stuart Ostrow became the producer, with Joseph Hardy as director. These plans also did not work out, and finally Harold Prince, who had worked previously with Sondheim, became the producer and director. He had agreed to work on The Girls Upstairs if Sondheim agreed to work on Company; Michael Bennett, the young choreographer of Company, was also brought onto the project. It was Prince who changed the title to Follies; he was \"intrigued by the psychology of a reunion of old chorus dancers and loved the play on the word \\'follies.\\n\\nPlot\\nIn 1971, on the soon-to-be-demolished stage of the Weismann Theatre, a reunion is being held to honor the Weismann\\'s Follies shows past and the beautiful chorus girls who performed there every year between the two world wars. The once resplendent theater is now little but planks and scaffolding (\"Prologue\"/\"Overture\"). As the ghosts of the young showgirls slowly drift through the theater, a majordomo enters with his entourage of waiters and waitresses. They pass through the spectral showgirls without seeing them.\\n\\nSally Durant Plummer, \"blond, petite, sweet-faced\" and at 49 \"still remarkably like the girl she was thirty years ago\", a former Weismann girl, is the first guest to arrive, and her ghostly youthful counterpart moves towards her. Phyllis Rogers Stone, a stylish and elegant woman, arrives with her husband Ben, a renowned philanthropist and politician. As their younger counterparts approach them, Phyllis comments to Ben about their past. He feigns a lack of interest; there is an underlying tension in their relationship. As more guests arrive, Sally\\'s husband, Buddy, enters. He is a salesman, in his early 50s, appealing and lively, whose smiles cover inner disappointment.\\n\\nFinally, Weismann enters to greet his guests. Roscoe, the old master of ceremonies, introduces the former showgirls (\"Beautiful Girls\"). 
Former Weismann performers at the reunion include Max and Stella Deems, who lost their radio jobs and became store owners in Miami; Solange La Fitte, a coquette, who is vibrant and flirtatious even at 66; Hattie Walker, who has outlived five younger husbands; Vincent and Vanessa, former dancers who now own an Arthur Murray franchise; Heidi Schiller, for whom Franz Lehár once wrote a waltz (\"or was it Oscar Straus?\" Facts never interest her; what matters is the song!); and Carlotta Campion, a film star who has embraced life and benefited from every experience.\\n\\nAs the guests reminisce, the stories of Ben, Phyllis, Buddy, and Sally unfold. Phyllis and Sally were roommates while in the Follies, and Ben and Buddy were best friends at school in New York. When Sally sees Ben, her former lover, she greets him self-consciously (\"Don\\'t Look at Me\"). Buddy and Phyllis join their spouses and the foursome reminisces about the old days of their courtship and the theater, their memories vividly coming to life in the apparitions of their young counterparts (\"Waiting For The Girls Upstairs\"). Each of the four is shaken at the realization of how life has changed them. Elsewhere, Willy Wheeler (portly, in his sixties) cartwheels for a photographer. Emily and Theodore Whitman, ex-vaudevillians in their seventies, perform an old routine (\"The Rain on the Roof\"). Solange proves she is still fashionable at what she claims is 66 (\"Ah, Paris!\"), and Hattie Walker performs her old showstopping number (\"Broadway Baby\").\\n\\nBuddy warns Phyllis that Sally is still in love with Ben, and she is shaken by how the past threatens to repeat itself. Sally is awed by Ben\\'s apparently glamorous life, but Ben wonders if he made the right choices and considers how things might have been (\"The Road You Didn\\'t Take\"). Sally tells Ben how her days have been spent with Buddy, trying to convince him (and herself) (\"In Buddy\\'s Eyes\"). However, it is clear that Sally is still in love with Ben – even though their affair ended badly when Ben decided to marry Phyllis. She shakes loose from the memory and begins to dance with Ben, who is touched by the memory of the Sally he once cast aside.\\n\\nPhyllis interrupts this tender moment and has a biting encounter with Sally. Before she has a chance to really let loose, they are both called on to participate in another performance – Stella Deems and the ex-chorines line up to perform an old number (\"Who\\'s That Woman?\"), as they are mirrored by their younger selves. Afterwards, Phyllis and Ben angrily discuss their lives and relationship, which has become numb and emotionless. Sally is bitter and has never been happy with Buddy, although he has always adored her. She accuses him of having affairs while he is on the road, and he admits he has a steady girlfriend, Margie, in another town, but always returns home. Carlotta amuses a throng of admirers with a tale of how her dramatic solo was cut from the Follies because the audience found it humorous, transforming it as she sings it into a toast to her own hard-won survival (\"I\\'m Still Here\").\\n\\nBen confides to Sally that his life is empty. She yearns for him to hold her, but young Sally slips between them and the three move together (\"Too Many Mornings\"). Ben, caught in the passion of memories, kisses Sally as Buddy watches from the shadows. 
Sally thinks this is a sign that the two will finally get married, and Ben is about to protest until Sally interrupts him with a kiss and runs off to gather her things, thinking that the two will leave together. Buddy leaves the shadows furious, and fantasizes about the girl he should have married, Margie, who loves him and makes him feel like \"a somebody\", but bitterly concludes he does not love her back (\"The Right Girl\"). He tells Sally that he\\'s done, but she is lost in a fantasy world and tells him that Ben has asked her to marry him. Buddy tells her she must be either crazy or drunk, but he\\'s already supported Sally through rehab clinics and mental hospitals and cannot take any more. Ben drunkenly propositions Carlotta, with whom he once had a fling, but she has a young lover and coolly turns him down. Heidi Schiller, joined by her younger counterpart, performs \"One More Kiss\", her aged voice a stark contrast to the sparkling coloratura of her younger self. Phyllis kisses a waiter and confesses to him that she had always wanted a son. She then tells Ben that their marriage can\\'t continue the way it has been. Ben replies by saying that he wants a divorce, and Phyllis assumes the request is due to his love for Sally. Ben denies this, but still wants Phyllis out. Angry and hurt, Phyllis considers whether to grant his request (\"Could I Leave You?\").\\n\\nPhyllis begins wondering at her younger self, who worked so hard to become the socialite that Ben needed. Ben yells at his younger self for not appreciating all the work that Phyllis did. Both Buddys enter to confront the Bens about how they stole Sally. Sally and her younger self enter and Ben firmly tells Sally that he never loved her. All the voices begin speaking and yelling at each other. Suddenly, at the peak of madness and confusion, the couples are engulfed by their follies, which transform the rundown theater into a fantastical \"Loveland\", an extravaganza even more grand and opulent than the gaudiest Weismann confection: \"the place where lovers are always young and beautiful, and everyone lives only for love\". Sally, Phyllis, Ben, and Buddy show their \"real and emotional lives\" in \"a sort of group nervous breakdown\".\\n\\nWhat follows is a series of musical numbers performed by the principal characters, each exploring their biggest desires. The two younger couples sing in a counterpoint of their hopes for the future (\"You\\'re Gonna Love Tomorrow/Love Will See Us Through\"). Buddy then appears, dressed in \"plaid baggy pants, garish jacket, and a shiny derby hat\", and performs a high-energy vaudeville routine depicting how he is caught between his love for Sally and Margie\\'s love for him (\"The God-Why-Don\\'t-You-Love-Me Blues\"). Sally appears next, dressed as a torch singer, singing of her passion for Ben from the past - and her obsession with him now (\"Losing My Mind\"). In a jazzy dance number, accompanied by a squadron of chorus boys, Phyllis reflects on the two sides of her personality, one naive and passionate and the other jaded and sophisticated and her desire to combine them (\"The Story of Lucy and Jessie\"). Resplendent in top hat and tails, Ben begins to offer his devil-may-care philosophy (\"Live, Laugh, Love\"), but stumbles and anxiously calls to the conductor for the lyrics, as he frantically tries to keep going. Ben becomes frenzied, while the dancing ensemble continues as if nothing was wrong. 
Amidst a deafening discord, Ben screams at all the figures from his past and collapses as he cries out for Phyllis.\\n\\n\"Loveland\" has dissolved back into the reality of the crumbling and half-demolished theater; dawn is approaching. Ben admits to Phyllis his admiration for her, and Phyllis shushes him and helps Ben regain his dignity before they leave. After exiting, Buddy escorts the emotionally devastated Sally back to their hotel with the promise to work things out later. Their ghostly younger selves appear, watching them go. The younger Ben and Buddy softly call to their \"girls upstairs\", and the Follies end.\\n\\nSongs\\nSource: Follies score\\n \"Prologue\" – Orchestra\\n \"Overture\" – Orchestra\\n \"Beautiful Girls\" – Roscoe and Company\\n \"Don\\'t Look at Me\" – Sally and Ben\\n \"Waiting for the Girls Upstairs\" – Ben, Sally, Phyllis and Buddy, Young Ben, Young Sally, Young Phyllis and Young Buddy\\n \"Montage\" (\"Rain on the Roof\"/\"Ah, Paris!\"/\"Broadway Baby\") – Emily, Theodore, Solange, and Hattie\\n \"The Road You Didn\\'t Take\" – Ben\\n \"Bolero d\\'Amour\" – Danced by Vincent and Vanessa ≠≠\\n \"In Buddy\\'s Eyes\" – Sally\\n \"Who\\'s That Woman?\" – Stella and Company\\n \"I\\'m Still Here\" – Carlotta\\n \"Too Many Mornings\" – Ben and Sally\\n \"The Right Girl\" – Buddy\\n \"One More Kiss\" – Heidi and Young Heidi\\n \"Could I Leave You?\" – Phyllis\\n \"Loveland\" – Company\\n \"You\\'re Gonna Love Tomorrow\" / \"Love Will See Us Through\" – Young Ben, Young Sally, Young Phyllis and Young Buddy\\n \"The God-Why-Don\\'t-You-Love-Me Blues\" – Buddy, \"Margie\", \"Sally\"\\n \"Losing My Mind\" – Sally\\n \"The Story of Lucy and Jessie\" ≠ – Phyllis and backup male dancers\\n \"Live, Laugh, Love\" – Ben and Company\\n \"Chaos\" – Ben and Company\\n \"Finale\" – Young Buddy and Young Ben\\n≠ Some productions substitute \"Ah, but Underneath\" when the actress portraying Phyllis is not primarily a dancer.\\n\\n≠≠ Omitted from some productions\\n\\nNote: This is the song list from the original Broadway production in 1971. Variations are discussed in Versions.\\n\\nSongs cut before the Broadway premiere include \"All Things Bright and Beautiful\" (used in the prologue), \"Can That Boy Foxtrot!\", \"Who Could Be Blue?\", \"Little White House\", \"So Many People\", \"It Wasn\\'t Meant to Happen\", \"Pleasant Little Kingdom\", and \"Uptown Downtown\". The musical numbers \"Ah, but Underneath\" (replacing \"The Story of Lucy and Jessie\"), \"Country House\", \"Make the Most of Your Music\" (replacing \"Live, Laugh, Love\"), \"Social Dancing\" and a new version of \"Loveland\" have been incorporated into various productions.\\n\\nAnalysis\\nHal Prince said: \"Follies examines obsessive behavior, neurosis and self-indulgence more microscopically than anything I know of.\" Bernadette Peters quoted Sondheim on the character of \"Sally\": \"He said early on that [Sally] is off-balance, to put it mildly. He thinks she\\'s very neurotic, and she is very neurotic, so he said to me \\'Congratulations. She\\'s crazy.\\'\" Martin Gottfried wrote: \"The concept behind Follies is theatre nostalgia, representing the rose-colored glasses through which we face the fact of age\\xa0... the show is conceived in ghostliness. At its very start, ghosts of Follies showgirls stalk the stage, mythic giants in winged, feathered, black and white opulence.
Similarly, ghosts of the Twenties shows slip through the evening as the characters try desperately to regain their youth through re-creations of their performances and inane theatre sentiments of their past.\"\\n\\nJoanne Gordon, author and chair and artistic director, Theatre, at California State University, Long Beach, wrote \"Follies is in part an affectionate look at the American musical theatre between the two World Wars and provides Sondheim with an opportunity to use the traditional conventions of the genre to reveal the hollowness and falsity of his characters\\' dreams and illusions. The emotional high generated by the reunion of the Follies girls ultimately gives way to anger, disappointment, and weary resignation to reality.\" \"Follies contains two scores: the Follies pastiche numbers and the book numbers.\" Some of the Follies numbers imitate the style of particular composers of the early 20th century: \"Losing My Mind\" is in the style of a George Gershwin ballad \"The Man I Love\". Sondheim noted that the song \"The God-Why-Don\\'t-You-Love-Me Blues\" is \"another generic pastiche: vaudeville music for chases and low comics, but with a patter lyric\\xa0... I tried to give it the sardonic knowingness of Lorenz Hart or Frank Loesser.\"\\n\\n\"Loveland\", the final musical sequence, (that \"consumed the last half-hour of the original\" production) is akin to an imaginary 1941 Ziegfeld Follies sequence, with Sally, Phyllis, Ben and Buddy performing \"like comics and torch singers from a Broadway of yore.\" \"Loveland\" features a string of vaudeville-style numbers, reflecting the leading characters\\' emotional problems, before returning to the theater for the end of the reunion party. The four characters are \"whisked into a dream show in which each acts out his or her own principal \\'folly.\\n\\nVersions\\nGoldman continued to revise the book of the musical right up to his death, which occurred shortly before the 1998 Paper Mill Playhouse production. Sondheim, too, has added and removed songs that he judged to be problematic in various productions. Ted Chapin explains: \"Today, Follies is rarely performed twice in exactly the same version. James Goldman\\'s widow made the observation that the show has morphed throughout its entire life\\xa0... The London production had new songs and dialogue. The Paper Mill Playhouse production used some elements from London but stayed close to the original. The 2001 Roundabout Broadway revival, the first major production following Goldman\\'s death in 1998, was again a combination of previous versions.\"\\n\\nMajor changes were made for the original production in London, which attempted to establish a lighter tone and favored a happier ending than the original Broadway production. According to Joanne Gordon, \"When Follies opened in London\\xa0... it had an entirely different, and significantly more optimistic, tone. Goldman\\'s revised book offered some small improvements over the original.\"\\n\\nAccording to Sondheim, the producer Cameron Mackintosh asked for changes for the 1987 London production. \"I was reluctantly happy to comply, my only serious balk being at his request that I cut \"The Road You Didn\\'t Take\"\\xa0... I saw no reason not to try new things, knowing we could always revert to the original (which we eventually did). The net result was four new songs\\xa0... For reasons which I\\'ve forgotten, I rewrote \"Loveland\" for the London production. 
There were only four showgirls in this version, and each one carried a shepherd\\'s crook with a letter of the alphabet on it.\"\\n\\nThe musical was written in one act, and the original director, Prince, did not want an intermission, while the co-director, Bennett, wanted two acts. It originally was performed in one act. The 1987 West End, 2005 Barrington Stage Company, 2001 Broadway revival and 2011 Kennedy Center productions were performed in two acts. However, the August 23, 2011, Broadway preview performance was performed without an intermission. By opening, the 2011 Broadway revival was performed with an intermission, in two acts. The 2017 National Theatre production is performed without an interval.\\n\\nProductions\\n\\n1971 original Broadway\\nFollies had its pre-Broadway tryout at the Colonial Theatre, Boston, from February 20 through March 20, 1971.\\n\\nFollies premiered on Broadway on April 4, 1971, at the Winter Garden Theatre. It was directed by Harold Prince and Michael Bennett, with choreography by Bennett, scenic design by Boris Aronson, costumes by Florence Klotz, and lighting by Tharon Musser. It starred Alexis Smith (Phyllis), John McMartin (Ben), Dorothy Collins (Sally), Gene Nelson (Buddy), along with several veterans of the Broadway and vaudeville stage. The supporting role of Carlotta was created by Yvonne De Carlo and usually is given to a well-known veteran performer who can belt out a song. Other notable performers in the original production were Fifi D\\'Orsay as Solange LaFitte, Justine Johnston as Heidi Schiller, Mary McCarty as Stella Deems, Arnold Moss as Dimitri Weismann, Ethel Shutta as Hattie Walker, and Marcie Stringer and Charles Welch as Emily and Theodore Whitman.\\n\\nThe show closed on July 1, 1972, after 522 performances and 12 previews. According to Variety, the production was a \"total financial failure, with a cumulative loss of $792,000.\" Prince planned to present the musical on the West Coast and then on a national tour. However, the show did not do well in its Los Angeles engagement and plans for a tour ended.\\n\\nFrank Rich, for many years the chief drama critic for The New York Times, had first garnered attention, while an undergraduate at Harvard University, with a lengthy essay for the Harvard Crimson about the show, which he had seen during its pre-Broadway run in Boston. He predicted that the show eventually would achieve recognition as a Broadway classic. Rich later wrote that audiences at the original production were baffled and restless.\\n\\nFor commercial reasons, the cast album was cut from two LPs to one early in production. Most songs were therefore heavily abridged and several were left entirely unrecorded. According to Craig Zadan, \"It\\'s generally felt that\\xa0... Prince made a mistake by giving the recording rights of Follies to Capitol Records, which in order to squeeze the unusually long score onto one disc, mutilated the songs by condensing some and omitting others.\" Chapin confirms this: \"Alas\\xa0... final word came from Capitol that they would not go for two records\\xa0... [Dick Jones] now had to propose cuts throughout the score in consultation with Steve.\" \"One More Kiss\" was omitted from the final release but was restored for CD release. Chapin relates that \"there was one song that Dick Jones [producer of the cast album] didn\\'t want to include on the album but which Steve Sondheim most definitely did.
The song was \"One More Kiss\", and the compromise was that if there was time, it would be recorded, even if Jones couldn\\'t promise it would end up on the album. (It did get recorded but didn\\'t make its way onto the album until the CD reissue years later.)\"\\n\\n1972 Los Angeles\\nThe musical was produced at The Muny, St. Louis, Missouri in July 1972 and then transferred to the Shubert Theatre, Century City, California, running from July 22, 1972, through October 1, 1972. It was directed by Prince and starred Dorothy Collins (Sally; replaced by Janet Blair), Alexis Smith (Phyllis), John McMartin (Ben; replaced by Edward Winter), Gene Nelson (Buddy), and Yvonne De Carlo (Carlotta) reprising their original roles. The production was the premiere attraction at the newly constructed 1,800-seat theater, which, coincidentally, was itself razed thirty years later (in 2002, in order to build a new office building), thus mirroring the Follies plot line upon which the musical is based.\\n\\n1985 Wythenshawe and Lincoln Center\\nA full production ran at the Forum Theatre, Wythenshawe, England, from April 30, 1985, directed by Howard Lloyd-Lewis, design by Chris Kinman, costumes by Charles Cusick-Smith, lighting by Tim Wratten, musical direction by Simon Lowe, and choreographed by Paul Kerryson. The cast included Mary Millar (Sally Durant Plummer), Liz Izen (Young Sally), Meg Johnson (Stella Deems), Les Want (Max Deems), Betty Benfield (Heidi Schiller), Joseph Powell (Roscoe), Chili Bouchier (Hattie Walker), Shirley Greenwood (Emily Whitman), Bryan Burdon (Theodore Whitman), Monica Dell (Solange LaFitte), Jeannie Harris (Carlotta Campion), Josephine Blake (Phyllis Rogers Stone), Kevin Colson (Ben), Debbie Snook (Young Phyllis), Stephen Hale (Young Ben), Bill Bradley (Buddy Plummer), Paul Burton (Young Buddy), David Scase (Dimitri Weismann), Mitch Sebastian (Young Vincent), Kim Ismay (Young Vanessa), Lorraine Croft (Young Stella), and Meryl Richardson (Young Heidi).\\n\\nA staged concert at Avery Fisher Hall, Lincoln Center, was performed on September 6 and 7, 1985. The concert starred Barbara Cook (Sally), George Hearn (Ben), Mandy Patinkin (Buddy), and Lee Remick (Phyllis), and featured Carol Burnett (Carlotta), Betty Comden (Emily), Adolph Green (Theodore), Liliane Montevecchi (Solange LaFitte), Elaine Stritch (Hattie Walker), Phyllis Newman (Stella Deems), Jim Walton (Young Buddy), Howard McGillin (Young Ben), Liz Callaway (Young Sally), Daisy Prince (Young Phyllis), Andre Gregory (Dmitri), Arthur Rubin (Roscoe), and Licia Albanese (Heidi Schiller). Rich, in his review, noted that \"As performed at Avery Fisher Hall, the score emerged as an original whole, in which the \\'modern\\' music and mock vintage tunes constantly comment on each other, much as the script\\'s action unfolds simultaneously in 1971 (the year of the reunion) and 1941 (the year the Follies disbanded).\"\\n\\nAmong the reasons the concert was staged was to provide an opportunity to record the entire score. The resulting album was more complete than the original cast album. However, director Herbert Ross took some liberties in adapting the book and score for the concert format—dance music was changed, songs were given false endings, the new dialogue was spoken, reprises were added, and Patinkin was allowed to sing \"The God-Why-Don\\'t-You-Love-Me Blues\" as a solo instead of a trio with two chorus girls. 
Portions of the concert were seen by audiences worldwide in the televised documentary about the making of the concert, also released on videotape and DVD, of \\'Follies\\' in Concert.\\n\\n1987 West End\\n\\nThe musical played in the West End at the Shaftesbury Theatre on July 21, 1987, and closed on February 4, 1989, after 644 performances. The producer was Cameron Mackintosh, the direction was by Mike Ockrent, with choreography by Bob Avian and design by Maria Björnson. The cast featured Diana Rigg (Phyllis), Daniel Massey (Ben), Julia McKenzie (Sally), David Healy (Buddy), Lynda Baron, Leonard Sachs, Maria Charles, Pearl Carr & Teddy Johnson. Dolores Gray was praised as Carlotta, continuing to perform after breaking her ankle, although in a reduced version of the part. During the run, Eartha Kitt replaced Gray, sparking somewhat of a comeback (she went on to perform her own one-woman show at The Shaftesbury Theatre to sell-out houses for three weeks from March 18, 1989, after Follies closed). Other cast replacements included Millicent Martin as Phyllis. Julia McKenzie returned to the production for the final four performances.\\n\\nThe book \"was extensively reworked by James Goldman, with Sondheim\\'s cooperation and also given an intermission.\" The producer Cameron Mackintosh did not like \"that there was no change in the characters from beginning to end\\xa0... In the London production\\xa0... the characters come to understand each other.\" Sondheim \"did not think the London script was as good as the original.\" However, he thought that it was \"wonderful\" that, at the end of the first act, \"the principal characters recognized their younger selves and were able to acknowledge them throughout the last thirty minutes of the piece.\" Sondheim wrote four new songs: \"Country House\" (replacing \"The Road You Didn\\'t Take\"), \"Loveland\" (replacing the song of the same title), \"Ah, But Underneath\" (replacing \"The Story of Lucy and Jessie\", for the non-dancer Diana Rigg), and \"Make the Most of Your Music\" (replacing \"Live, Laugh, Love\").\\n\\nCritics who had seen the production in New York (such as Frank Rich) found it substantially more \"upbeat\" and lacking in the atmosphere it had originally possessed. According to the Associated Press (AP) reviewer, \"A revised version of the Broadway hit Follies received a standing ovation from its opening-night audience and raves from British critics, who stated the show was worth a 16-year wait.\" The AP quoted Michael Coveney of the Financial Times, who wrote: \"Follies is a great deal more than a camp love-in for old burlesque buffs and Sondheim aficionados.\" In The New York Times, the critic Francis X. Clines wrote: \"The initial critics\\' reviews ranged from unqualified raves to some doubts whether the reworked book of James Goldman is up to the inventiveness of Sondheim\\'s songs. \\'A truly fantastic evening,\\' The Financial Times concluded, while the London Daily News stated \\'The musical is inspired,\\' and The Times described the evening as \\'a wonderful idea for a show which has failed to grow into a story. The Times critic Irving Wardle stated \"It is not much of a story, and whatever possibilities it may have had in theory are scuppered by James Goldman\\'s book\\xa0... a blend of lifeless small-talk, bitching and dreadful gags\". 
Clines further commented: \"In part, the show is a tribute to musical stage history, in which the 57-year-old Mr Sondheim is steeped, for he first learned song writing at the knee of Oscar Hammerstein II and became the acknowledged master songwriter who bridged past musical stage romance into the modern musical era of irony and neurosis. Follies is a blend of both, and the new production is rounded out with production numbers celebrating love\\'s simple hope for young lovers, its extravagant fantasies for Ziegfeld aficionados, and its fresh lesson for the graying principals.\"\\n\\nThis production was also recorded on two CDs and was the first full recording.\\n\\nFollies was voted ninth in a BBC Radio 2 listener poll of the UK\\'s \"Nation\\'s Number One Essential Musicals\".\\n\\nU.S. regional productions\\nMichigan Opera Theatre (MOT) was the first major American opera company to present Follies as part of its main stage repertoire, running from October 21, 1988, through November 6. The MOT production starred Nancy Dussault (Sally), John-Charles Kelly (Buddy), Juliet Prowse (Phyllis), Ron Raines (Ben), Edie Adams (Carlotta), Thelma Lee (Hattie), and Dennis Grimaldi (Vincent).\\n\\nA production also ran from March to April 1995 at the Theatre Under the Stars, Houston, Texas, and in April to May 1995 at the 5th Avenue Theatre, Seattle, with Constance Towers (Phyllis), Judy Kaye (Sally), Edie Adams, Denise Darcel, Virginia Mayo and Karen Morrow (Carlotta). The 1998 Paper Mill Playhouse production (Millburn, New Jersey) was directed by Robert Johanson with choreography by Jerry Mitchell and starred Donna McKechnie (Sally), Dee Hoty (Phyllis), Laurence Guittard (Ben), Tony Roberts (Buddy), Kaye Ballard (Hattie), Eddie Bracken (Weismann), and Ann Miller (Carlotta). Phyllis Newman and Liliane Montevecchi reprised the roles they played in the Lincoln Center production. \"Ah, but Underneath\" was substituted for \"The Story of Lucy and Jessie\" in order to accommodate non-dancer Hoty. This production received a full-length recording on two CDs, including not only the entire score as originally written but a lengthy appendix of songs cut from the original production in tryouts.\\n\\nJulianne Boyd directed a fully staged version of Follies by the Barrington Stage Company (Massachusetts) in June–July 2005. The principal cast included Kim Crosby (Sally), Leslie Denniston (Phyllis), Jeff McCarthy (Ben), Lara Teeter (Buddy), Joy Franz (Solange), Marni Nixon (Heidi), and Donna McKechnie (Carlotta). Stephen Sondheim attended one of the performances.\\n\\n1996 and 1998 concerts\\nDublin concert\\nThe Dublin Concert was held in May 1996 at the National Concert Hall. Directed by Michael Scott, the cast included Lorna Luft, Millicent Martin, Mary Millar, Dave Willetts, Trevor Jones, Bryan Smyth, Alex Sharpe, Christine Scarry, Aidan Conway and Enda Markey.\\n\\nLondon concert\\nA concert was held at Theatre Royal, Drury Lane, London, on December 8, 1996, and broadcast on BBC Radio 2 on February 15, 1997. The cast starred Julia McKenzie (Sally), Donna McKechnie (Phyllis), Denis Quilley (Ben) and Ron Moody (Buddy). This show recreated the original Broadway score.\\n\\nSydney concert\\nFollies was performed in concert at the Sydney Opera House with the Sydney Symphony Orchestra in February 1998 as the highlight of the Sydney Gay and Lesbian Mardi Gras and had three performances. It was directed and staged by Stephen Lloyd Helper and produced by Helper and Alistair Thomson for Mardi Gras.
It starred Toni Lamond (Sally), Jill Perryman(Carlotta), Judi Connelli (Phyllis), Terence Donovan (Ben), Nancye Hayes (Hattie), Glenn Butcher (Buddy), Ron Haddrick (Dimitri), Susan Johnston (Heidi), and Leonie Page, Maree Johnson, Mitchell Butel, Maureen Howard. The Sydney Symphony was conducted by Maestro Tommy Tycho. It followed a similar presentation at the 1995 Melbourne Festival of Arts with a different cast and orchestra.\\n\\n2001 Broadway revival\\nA Broadway revival opened at the Belasco Theatre on April 5, 2001, and closed on July 14, 2001, after 117 performances and 32 previews. This Roundabout Theatre limited engagement had been expected to close on September 30, 2001. Directed by Matthew Warchus with choreography by Kathleen Marshall, it starred Blythe Danner (Phyllis), Judith Ivey (Sally), Treat Williams (Buddy), Gregory Harrison (Ben), Marge Champion, Polly Bergen (Carlotta), Joan Roberts (Laurey from the original Broadway production of Oklahoma!; later replaced by Marni Nixon), Larry Raiken (Roscoe) and an assortment of famous names from the past. Former MGM and onetime Broadway star Betty Garrett, best known to younger audiences for her television work, played Hattie. It was significantly stripped down (earlier productions had featured extravagant sets and costumes) and was not a success critically.\\n\\nAccording to an article in The Hollywood Reporter, \"almost every performance of the show played to a full house, more often than not to standing-room-only. Tickets always were tough to come by. The reason the final curtain came down Saturday was that being a production by the Roundabout Theatre Company – a subscription-based \\'not-for-profit\\' theater company – it was presented under special Equity terms, with its actors paid a minimal fee. To extend the show, it would have been necessary to negotiate new contracts with the entire company\\xa0... because of the Belasco\\'s limited seating, it wasn\\'t deemed financially feasible to do so.\"\\n\\nTheater writer and historian John Kenrick wrote \"the bad news is that this Follies is a dramatic and conceptual failure. The good news is that it also features some of the most exciting musical moments Broadway has seen in several seasons. Since you don\\'t get those moments from the production, the book or the leads, that leaves the featured ensemble, and in Follies that amounts to a small army\\xa0... Marge Champion and Donald Saddler are endearing as the old hoofers\\xa0... I dare you not to fall in love with Betty Garrett\\'s understated \"Broadway Baby\" – you just want to pick her up and hug her. Polly Bergen stops everything cold with \"I\\'m Still Here\", bringing a rare degree of introspection to a song that is too often a mere belt-fest\\xa0... [T]he emotional highpoint comes when Joan Roberts sings \\'One More Kiss\\'.\"\\n\\n2002 London revival\\nA production was mounted at London\\'s Royal Festival Hall in a limited engagement. After previews from August 3, 2002, it opened officially on August 6, and closed on August 31, 2002. Paul Kerryson directed, and the cast starred David Durham as Ben, Kathryn Evans as Sally, Louise Gold as Phyllis, Julia Goss as Heidi and Henry Goodman as Buddy. Variety singer and performer Joan Savage sang \"Broadway Baby\". This production conducted by Julian Kelly featured the original Broadway score.\\n\\n2002 Los Angeles\\nFollies was part of L.A.\\'s Reprise series, and it was housed at the Wadsworth Theatre, presented as a staged concert, running from June 15 to 23, 2002. 
The production was directed by Arthur Allan Seidelman, set design by Ray Klausen, lighting design by Tom Ruzika, costumes by Randy Gardell, sound design by Philip G. Allen, choreography by Kay Cole, musical director Gerald Sternbach.\\n\\nThe production starred Bob Gunton (Ben), Warren Berlinger (Dimitri Weismann), Patty Duke (Phyllis), Vikki Carr (Sally), Harry Groener (Buddy), Carole Cook (Hattie), Carol Lawrence (Vanessa), Ken Page (Roscoe), Liz Torres (Stella), Amanda McBroom (Solange), Grover Dale (Vincent), Donna McKechnie (Carlotta), Carole Swarbrick (Christine), Stella Stevens (Dee Dee), Mary Jo Catlett (Emily), Justine Johnston (Heidi), Jean Louisa Kelly (Young Sally), Austin Miller (Young Buddy), Tia Riebling (Young Phyllis), Kevin Earley (Young Ben), Abby Feldman (Young Stella), Barbara Chiofalo (Young Heidi), Trevor Brackney (Young Vincent), Melissa Driscoll (Young Vanessa), Stephen Reed (Kevin), and Billy Barnes (Theodore). Hal Linden originally was going to play Ben, but left because he was cast in the Broadway revival of Cabaret as Herr Schultz. Tom Bosley originally was cast as Dimitri Weismann.\\n\\n2003 Ann Arbor\\nA concert production at the Michigan Theater in January 2003 reunited the four principal young ghosts of the original Broadway cast: Kurt Peterson, Harvey Evans, Virginia Sandifur, and Marti Rolph. Having originated the young ghosts over 30 years prior, the actors portrayed the older versions of their Broadway roles. Donna McKechnie enjoyed top billing as Carlotta.\\n\\n2007 New York City Center Encores!\\nNew York City Center\\'s Encores! \"Great American Musicals in Concert\" series featured Follies as its 40th production for six performances in February 2007 in a sold out semi-staged concert. The cast starred Donna Murphy (Phyllis), Victoria Clark (Sally), Victor Garber (Ben) and Michael McGrath (Buddy). Christine Baranski played Carlotta, and Lucine Amara sang Heidi. The cast included Anne Rogers, Jo Anne Worley and Philip Bosco. The director and choreographer was Casey Nicholaw. This production used the original text, and the \"Loveland\" lyrics performed in the 1987 London production.\\n\\n2011 Kennedy Center and Broadway\\nThe Kennedy Center for the Performing Arts production at the Eisenhower Theater started previews on May 7, 2011, with an official opening on May 21, and closed on June 19, 2011. The cast starred Bernadette Peters as Sally, Jan Maxwell as Phyllis, Elaine Paige as Carlotta, Linda Lavin as Hattie, Ron Raines as Ben and Danny Burstein as Buddy. The production was directed by Eric Schaeffer, with choreography by Warren Carlyle, costumes by Gregg Barnes, set by Derek McLane and lighting by Natasha Katz. Also featured were Rosalind Elias as Heidi, Régine as Solange, Susan Watson as Emily, and Terri White as Stella. The budget was reported to be $7.3 million. The production played to 95% capacity.\\n\\nReviews were mixed, with Ben Brantley of The New York Times writing \"It wasn\\'t until the second act that I fell in love all over again with Follies\". 
Peter Marks of The Washington Post wrote that the revival \"takes an audience halfway to paradise.\" He praised a \"broodingly luminous Jan Maxwell\" and Burstein\\'s \"hapless onetime stage-door Johnny\", as well as \"the show\\'s final 20 minutes, when we ascend with the main characters into an ironic vaudeville dreamscape of assorted neuroses - the most intoxicating articulation of the musical\\'s \\'Loveland\\' sequence that I\\'ve ever seen.\" Variety gave a very favorable review to the \"lavish and entirely satisfying production\", saying that Schaeffer directs \"in methodical fashion, building progressively to a crescendo exactly as Sondheim does with so many of his stirring melodies. Several show-stopping routines are provided by choreographer Warren Carlyle.\" Terry Teachout of the Wall Street Journal noted that \"One of the signal achievements of this Follies is that it succeeds in untangling each and every strand of the show\\'s knotty plot\\xa0... Mr. Schaeffer is clearly unafraid of the darkness of Follies, so much so that the first act is bitter enough to sting. Yet he and Warren Carlyle\\xa0... just as clearly revel in the richness of the knowing pastiche songs with which Mr. Sondheim evokes the popular music of the prerock era.\"\\n\\nThe production transferred to Broadway at the Marquis Theatre in a limited engagement starting previews on August 7, 2011, with the official opening on September 12, and closing on January 22, 2012, after 151 performances and 38 previews. The four principal performers reprised their roles, as well as Paige as Carlotta. Jayne Houdyshell as Hattie, Mary Beth Peil as Solange LaFitte, and Don Correia as Theodore joined the Broadway cast. A two-disc cast album of this production was recorded by PS Classics and was released on November 29, 2011.\\n\\nBrantley reviewed the Broadway revival for The New York Times, writing: \"Somewhere along the road from Washington to Broadway, the Kennedy Center production of Follies picked up a pulse\\xa0... I am happy to report that since then, Ms Peters has connected with her inner frump, Mr. Raines has found the brittle skeleton within his solid flesh, and Ms. Maxwell and Mr. Burstein have only improved. Two new additions to the cast, Jayne Houdyshell and Mary Beth Peil, are terrific. This production has taken on the glint of crystalline sharpness.\" The production\\'s run was extended, and its grosses exceeded expectations, but it did not recoup its investment.\\n\\nThe Broadway production won the Drama League Award, Distinguished Production of a Musical Revival for 2011-2012 and the Drama Desk Award for Outstanding Revival of a Musical, Outstanding Actor in a Musical (Burstein) and Outstanding Costume Design (Barnes). Out of seven Tony Award nominations, including Best Revival of a Musical, it won only one, for Barnes\\' costumes.\\n\\n2012 Los Angeles\\nThe 2011 Broadway and Kennedy Center production transferred to the Ahmanson Theatre, Los Angeles, California, in a limited engagement, from May 3, 2012, through June 9. The majority of the Broadway cast reprised their roles, with the exception of Bernadette Peters, who had prior concert commitments and was replaced by Victoria Clark in the role of Sally, a role she has previously played in New York. Other new cast members included Carol Neblett as Heidi, Sammy Williams as Theodore and Obba Babatunde as Max.\\n\\n2013 Toulon Opera House (France)\\nFor its first production in France, Follies was presented at the Toulon Opera House in March 2013. 
This English-language production, using the full original orchestration, was directed by Olivier Bénézech and conducted by David Charles Abell. The cast featured Charlotte Page (Sally), Liz Robertson (Phyllis), Graham Bickley (Ben), Jérôme Pradon (Buddy), Nicole Croisille (Carlotta), Julia Sutton (Hattie) and Fra Fee (Young Buddy).\\n\\n2016 Australian concert version\\nA concert version was staged at the Melbourne Recital Centre with a full 23-piece orchestra and Australian actors Philip Quast (Ben), David Hobson (Buddy), Lisa McCune (Sally), Anne Wood (Phyllis), Rowan Witt (Young Buddy), Sophie Wright (Young Sally), Nancye Hayes (Hattie), Debra Byrne (Carlotta), and Queenie van de Zandt (Stella). The production was directed by Tyran Parke and produced by StoreyBoard Entertainment.\\n\\n2017 London revival\\nA London revival was performed in the Olivier Theatre at the National Theatre (August 22 until November 4, 2017, later extended to January 3, 2018, as extensions are common practice at the National Theatre). The production was directed by Dominic Cooke, choreographed by Bill Deamer and starred Peter Forbes as Buddy, Imelda Staunton as Sally, Janie Dee as Phyllis, Philip Quast as Ben and Tracie Bennett as Carlotta. This production notably goes back to the original plan of a one-act performance. The production was broadcast live to cinemas worldwide on November 16 through the National Theatre Live program.\\n\\nThe production returned to the Olivier Theatre on February 14, 2019, playing until May 11. Janie Dee and Peter Forbes returned as Phyllis and Buddy, while Joanna Riding and Alexander Hanson replaced Staunton and Quast as Sally and Ben. Bennett also reprised her Olivier-nominated performance. A recording of the National Theatre production was released on January 18, 2019.\\n\\nThe 2017 production was nominated for 10 Laurence Olivier Awards and won two: Best Musical Revival and Best Costume Design (by Vicki Mortimer).\\n\\nCharacters and original cast\\n\\nThe characters and original cast:\\n\\nCritical response\\nIn the foreword to \"Everything Was Possible\", Frank Rich wrote: \"From the start, critics have been divided about Follies, passionately pro or con but rarely on the fence\\xa0... Is it really a great musical, or merely the greatest of all cult musicals?\" (Chapin, p. xi) Ted Chapin wrote, \"Taken as a whole, the collection of reviews Follies received was as rangy as possible.\" (Chapin, p.\\xa0300) In his The New York Times review of the original Broadway production, Clive Barnes wrote: \"it is stylish, innovative, it has some of the best lyrics I have ever encountered, and above all it is a serious attempt to deal with the musical form.\" Barnes also called the story shallow and Sondheim\\'s words a joy \"even when his music sends shivers of indifference up your spine.\"\\n\\nWalter Kerr wrote in The New York Times about the original production: \"Follies is intermissionless and exhausting, an extravaganza that becomes so tedious\\xa0... because its extravaganzas have nothing to do with its pebble of a plot.\" On the other hand, Martin Gottfried wrote: \"Follies is truly awesome and, if it is not consistently good, it is always great.\"\\n\\nTime magazine wrote about the original Broadway production: \"At its worst moments, Follies is mannered and pretentious, overreaching for Significance.
At its best moments—and there are many—it is the most imaginative and original new musical that Broadway has seen in years.\"\\n\\nFrank Rich, in reviewing the 1985 concert in The New York Times, wrote: \"Friday\\'s performance made the case that this Broadway musical\\xa0... can take its place among our musical theater\\'s very finest achievements.\" Ben Brantley, reviewing the 1998 Paper Mill Playhouse production in The New York Times, concluded that it was a \"fine, heartfelt production, which confirms Follies as a landmark musical and a work of art\\xa0...\".\\n\\nThe Time reviewer wrote of the 2001 Broadway revival: \"Even in its more modest incarnation, Follies has, no question, the best score on Broadway.\" He noted, though, that \"I\\'m sorry the cast was reduced from 52 to 38, the orchestra from 26 players to 14\\xa0... To appreciate the revival, you must buy into James Goldman\\'s book, which is peddling a panoramically bleak take on marriage.\" Finally, he wrote: \"But Follies never makes fun of the honorable musical tradition to which it belongs. The show and the score have a double vision: simultaneously squinting at the messes people make of their lives and wide-eyed at the lingering grace and lift of the music they want to hear. Sondheim\\'s songs aren\\'t parodies or deconstructions; they are evocations that recognize the power of a love song. In 1971 or 2001, Follies validates the legend that a Broadway show can be an event worth dressing up for.\"\\n\\nBrantley, reviewing the 2007 Encores! concert for The New York Times, wrote: \"I have never felt the splendid sadness of Follies as acutely as I did watching the emotionally transparent concert production\\xa0... At almost any moment, to look at the faces of any of the principal performers\\xa0... is to be aware of people both bewitched and wounded by the contemplation of who they used to be. When they sing, in voices layered with ambivalence and anger and longing, it is clear that it is their past selves whom they are serenading.\"\\n\\nRecordings\\nThere have been six recordings of Follies released: the original 1971 Broadway cast album; Follies in Concert, Avery Fisher Hall (1985); the original London production (1987); the Paper Mill Playhouse (1998); the 2011 Broadway revival; and the 2017 London revival. The original cast album has always been controversial, because significant portions of the score were cut to fit onto one LP. However, as Kritzerland Records head Bruce Kimmel wrote in his liner notes to Kritzerland\\'s remixed version of the album, \"What it did have made it something that, despite the frustrations, meant it would never be bettered – the original cast.\"\\nThe cast recording of the 2011 Broadway revival, by PS Classics, was released officially on November 29, 2011, and was in pre-sale before the store release. PS Classics co-founder Tommy Krasker stated \"We\\'ve never had the kind of reaction that we\\'ve had for Follies. Not only has it already outsold every other album at our website, but the steady stream of emails from customers has been amazing.\" This recording includes \"extended segments of the show\\'s dialogue\". The theatermania.com reviewer wrote that \"The result is an album that, more so than any of the other existing recordings, allows listeners to re-experience the heartbreaking collision of past and present that\\'s at the core of the piece.\" The recording of the 2011 revival was nominated for a Grammy Award in the Musical Theater Album category. 
The 2017 London revival cast was recorded after the production closed in January 2018, and was released in early 2019.\\n\\nFilm adaptation\\nIn January 2015, it was reported that Rob Marshall is set to direct the movie, and Meryl Streep was rumored to star in it. Tony Award-winning playwright and Oscar-nominated screenwriter John Logan has expressed interest in writing a film adaptation of Follies.\\n\\nIn November 2019, it was announced that Dominic Cooke will adapt the screenplay and direct the film, after having directed the successful 2017 revival in the National Theatre in London, which returned in 2019 because of popular demand.\\n\\nAwards and nominations\\n\\nOriginal Broadway production\\n\\nOriginal London production\\n\\n2001 Broadway revival\\n\\n2011 Broadway revival\\n\\n2017 London revival\\n\\nNotes\\n\\nReferences\\n Chapin, Ted (2003). Everything Was Possible: The Birth of the Musical Follies. New York, New York: Alfred A. Knopf. \\n Secrest, Meryle (1998). Stephen Sondheim: A Life. Dell Publishing, Alfred A. Knopf (reprint). \\n Sondheim, Stephen and Goldman, James (2001). Follies. New York, New York: Theatre Communications Group. \\nSondheim, Stephen (2010). Finishing the Hat. Alfred A. Knopf.\\n\\nFurther reading\\n Prince, Harold (1974). Contradictions: Notes on Twenty-six Years in the Theatre. Dodd, Mead. \\n Ilson, Carol (2004). Harold Prince: A Director\\'s Journey, Limelight Editions. \\n Mandelbaum, Ken (1990). A Chorus Line and the Musicals of Michael Bennett. St. Martins Press.\\n\\nExternal links\\n \\n Follies on The Stephen Sondheim Reference Guide\\n \\n Follies at the Music Theatre International website\\n\\n1971 musicals\\nBroadway musicals\\nLaurence Olivier Award-winning musicals\\nOriginal musicals\\nMusicals by James Goldman\\nMusicals by Stephen Sondheim\\nWest End musicals\\nPlays set in New York City\\nTony Award-winning musicals\\nBackstage musicals'},\n", + " {'docid': 'doc-en-11',\n", + " 'text': 'Cleopatra in Space is an American animated television series produced by DreamWorks Animation and animated by Titmouse, Inc., based on the graphic novel series of the same name by Mike Maihack. The showrunners for the series are Doug Langdale and Fitzy Fitzmaurice.\\n\\nIn the United States, the first five episodes were released on NBCUniversal\\'s streaming service Peacock for Xfinity customers on April 15, 2020, making this the first DreamWorks Animation series to be released on a streaming device other than Netflix or Amazon Video. On July 15, 2020, the first season was officially released when the service launched nationwide. Prior to its release in the United States, the series was first broadcast in Southeast Asia on DreamWorks Channel beginning on November 25, 2019. The show is geared toward those between ages 6 and 11. Langdale, in an interview, said that he is attempting to make sure the show is \"accessible to a younger audience,\" even as he doesn\\'t give much thought to what age demographic the show is aiming towards.\\n\\nOn July 15, the show premiered on Peacock, with episodes 1–5 and 7–13 of the first season made available to viewers who subscribed to \"Peacock Premium\", and a more limited selection for those who chose a free plan. It was one of the three animated Peacock Originals streaming on the platform, with the other two being season 13 of Curious George and season 2 of Where\\'s Waldo?. The show can only be watched using the streaming service\\'s premium plan. On November 19, 2020, Season 2 premiered on Peacock. 
On January 14, 2021, Season 3 was released on Peacock. On July 14, 2021, all three seasons were added to Hulu.\\n\\nPlot\\nCleopatra in Space is a comedic adventure focusing on Cleopatra\\'s teenage years, as she deals with the ups and downs of being a high school teenager after she is transported 30,000 years into the future, to an Egyptian-themed planet ruled by talking cats, where she is said to be the savior of the galaxy. Cleopatra and her newfound friends work to return her to her own time in Ancient Egypt, and she gains new combat skills in the process. Showrunner Doug Langdale described the show as a \"real move-forward story\" which continues forward without interruption.\\n\\nCharacters\\n\\nMain\\n Cleopatra \"Cleo\" (voiced by Lilimar Hernandez) - The fearless and confident protagonist of the series. The 15-year-old princess of ancient Egypt, whose father is Pharaoh King Ptolemy (Sendhil Ramamurthy), she ends up sucked into a portal that sends her 30,000 years into the future, where she learns she is the prophesied \"Savior of the Nile Galaxy\", destined to defeat the evil space tyrant Octavian. She ends up attending the futuristic intergalactic academy named P.Y.R.A.M.I.D. to obtain the proper training and skills to fulfill her role. She is sometimes reckless and impulsive, but has a good heart and wants peace. She has also gained strange and powerful abilities from her time-travel, which manifest in pink and can be used to drain energy and project it into energy waves and beams. Lilimar called Cleo a character who is not completely mature or responsible, but a young girl who is on the road to becoming a hero, a person who is courageous and brave, seeing \"a lot of positivity in the world, no matter how dark things seem to be,\" even as she seeks adventure all the time.\\n Akila (voiced by Katie Crown) - A pink-eyed fish girl from another planet and the first of Cleopatra\\'s teammates. She is very friendly and optimistic, but over-enthusiastic. She may have a crush on Brian. She has two moms: Theoda (voiced by Cissy Jones) and Pothina (voiced by Kari Wahlgren), who are scholars at The Savior Institute; they use dated social expressions and love their daughter. They are the first confirmed LGBTQ characters in the series.\\n Brian (voiced by Jorge Diaz) - A cyborg teenage boy, and Cleopatra\\'s second teammate. His body is mostly robotic, and he is sensitive about the fact that he was transformed into a cyborg. He is rather prone to worry, paranoid, nervous, and self-conscious at times. He has a crush on Akila. He and Akila later have a foster child of sorts named Eyeball, who is voiced by Brian Posehn.\\n Khensu (voiced by Sendhil Ramamurthy) - An intelligent, long-eared cat with the ability to speak. He serves as the leader of the group, and a teacher at P.Y.R.A.M.I.D. He becomes, as noted by showrunner Doug Langdale, like a surrogate father to Cleo, balancing out her action and acrobatics with his intellectual, sensible, and down-to-Earth nature.\\n Mihos - The cute and lovable pet of Cleo. Later Doctor Queed says that Mihos can\\'t be native to the planet but rather is a \"lost pet.\" Boop, a female pirate and space squirrel, is his doppelgänger. Lilimar, in an interview, said that while she likes Brian and Akila as characters, Mihos is her favorite, even making a request to make him into a plushie.\\n\\nSupporting\\n Callie (voiced by Kari Wahlgren) - An arrogant and snobby student at P.Y.R.A.M.I.D. who becomes Cleopatra\\'s academic rival upon her arrival at the school.
\n Xerxs (voiced by Dee Bradley Baker) - Legions of alien robots who are the soldiers and servants of Octavian.\n Zaid Antonius (voiced by Xolo Maridueña) - A student at P.Y.R.A.M.I.D., whom Cleopatra is attracted to. He is later revealed to be a spy of Octavian, due to the latter holding his parents, Dahab (Candi Milo) and Askari (Dee Bradley Baker), hostage. His family name is a hint towards the historical figure Marcus Antonius.\n Administrant Khepra (voiced by Sumalee Montano) - The head of the cat-council.\n Octavian (voiced by Jonathan Kite) - The main antagonist of the series. The evil ruler of the Xerxs who has destroyed or enslaved many worlds, and Cleopatra\'s arch-nemesis. He is bent on capturing and getting rid of Cleopatra so the prophecy about her defeating him cannot be fulfilled. Named after the historical figure Gaius Octavian.\n E\'Geke-Ek\'Gek (voiced by Alex Cazares) - an alien student who requires a translator torque to communicate.\n Yosira (voiced by Wahlgren) - The young Pharaoh of P.Y.R.A.M.I.D. She is the granddaughter of the founder of P.Y.R.A.M.I.D., the late Pharaoh Yosiro. \n Zuzz (voiced by Zach Callison) - A student whose body consists of a swarm of insect-shaped \"bits\".\n Omnia (voiced by Elise Dubois) - A robot representing a planet and part of the debate club at P.Y.R.A.M.I.D.\n\nOther characters\n Professor Sitre (voiced by Amy Hill) - a middle-aged cat teacher at P.Y.R.A.M.I.D.\n Philo (voiced by Gunnar Sizemore) - A young apprentice at P.Y.R.A.M.I.D., whom Cleopatra mentors in one episode in order to attain the cadets\' Level 2.\n Zedge (voiced by Lucas Grabeel) - A popular intergalactic rockstar, whom Akila has a crush on and claims to be his \"biggest fan\". He was briefly mind-controlled by Octavian to capture Cleopatra. \n Cyrano (voiced by Greg Cipes) - An evil artificial intelligence created by Octavian to counter Brian. \n Gozi (voiced by Karan Brar) - A young Egyptian boy who was Cleopatra\'s best friend back on Earth in her own time. \n Msamaki (voiced by David Shaughnessy) - An intelligent, long-eared cat with the ability to speak, and a member of the Cat council.\n Professor Klabrax V (voiced by Dawnn Lewis) - An intelligent, fish-like being who is a professor at P.Y.R.A.M.I.D.\n Dr. Queed (voiced by Paul Rugg) - A former doctor at P.Y.R.A.M.I.D. and an acquaintance of Khensu whose catchphrase is \"uncanny scientific brilliance!\"\n Debbie (voiced by Candi Milo) - A lonely planet who can manifest into various forms and later a student at P.Y.R.A.M.I.D.\n Generator A.I. (voiced by Dee Bradley Baker) - An A.I. located near the academy which is used to generate electricity for the campus after Cleo sucks out all the power from the academy.\n Damaris (voiced by Marieve Herington) - A space scavenger, part of a group led by Dave.\n Dave (voiced by Ace Gibson) - Leader of the space scavengers who has a pet named Precious.\n Simon (voiced by Rhys Darby) - A conniving snake who turns on Akila, with Cleo and her parents working together to stop it.\n Gurbo Gorbage (voiced by Kay Bess) - A purported television personality who kidnaps Cleo and almost sends her to Octavian.\n Amsaja (voiced by Kimberly D. Brooks) - The self-declared \"Queen of the Space Pirates,\" who heads a crew of three other pirates, and Cleo\'s doppelgänger.
She previously had the telepathic space shark ninja as her ex-boyfriend, and Octavian might be her ex-boyfriend as well.\n Cyborg Dwayne - (voiced by Andrew Morgado) - The doppelgänger of Brian who is also on the pirate ship.\n Medjed (voiced by Ken Pringle) - Ruler of Dargham who tries to convince Cleo and Zaid to stay there indefinitely.\n Gled (voiced by John DiMaggio) - Chief of the Tawrisians on the planet Tawris in the Nile Galaxy. \n Commodore Winifred (voiced by Toks Olagundoye) - The commander of a ship of Parvites atop the head of Mihos, whose full name is Commodore Winifred Blurvington the Third.\n Mortimer (voiced by Damian O\'Hare) - A lieutenant who serves under Commodore Winifred and resembles a proper British gentleman.\n\nProduction\nIn January 2018, DreamWorks Animation filed trademark applications for the show with the United States Patent and Trademark Office. Since then, DreamWorks has filed for four extensions on their trademark for Cleopatra in Space, twice in 2019 and twice in 2020, all of which were granted.\n\nMike Maihack said that the series is in retroactive continuity to his comics because Cleo is a teenager and there is time travel. This differs from his comic book series, which is \"rooted from stories and research of the actual Cleopatra.\" He also may have been influenced by Kipo and the Age of Wonderbeasts, Avatar: The Last Airbender, Star Trek: The Next Generation, Buffy the Vampire Slayer, and Legends of Tomorrow. In an interview with Charles C. Dowd on I Heart Webcomics on July 22, Maihack said that he consulted in the early stages of the show, letting DreamWorks know the upcoming details of his book and remained supportive, admitting he did not want \"a direct adaption of the novels.\" He further said that he saw the animated series as an opportunity for a team who had worked on shows such as Ben 10: Omniverse and DuckTales to work on the Cleo in Space concept in another way, noting that while it was clear that those working on the series understood \"the core components of the story,\" he stepped back, letting the \"amazingly talented folks\" involved in the show do their work.\n\nIn an interview with Jackson Murphy of Animation Scoop, showrunner Doug Langdale said the story lends itself to \"serialized storytelling\" rather than an animated feature film, and that developing the show has been a \"pretty involved process.\" He also stated that the different uniforms at the Academy are \"different colors for different divisions in the school and emblems,\" that they used common science-fiction tropes, that the show is not lesson-based but is just entertainment, and that Mike Maihack was okay with deviating from the original graphic novels so they could create something that fans would enjoy. Langdale expanded on this in an interview with Screen Rant, where he recalled that they had found Maihack\'s books and noted that DreamWorks had been trying to create a feature film based on them, an effort that was abandoned so they could do a series. He further described the differences between the books and the series, which are on a \"day-to-day basis,\" with the series not following the books closely at all, even as they used the \"same set up, [and] many of the same characters.\" Langdale explained how Egyptian history was an inspiration for many character names, sometimes by coincidence, and how Lilimar Hernandez was the choice for the main role of Cleopatra from the beginning, while stating how Sendhil Ramamurthy, Jorge Diaz, and Katie Crown influenced the show through their voice acting.
He finally told the interviewer that the show ended up with a \"predominantly female cast,\" with DreamWorks seeing this as a \"good time to make a show with a female lead,\" and explained that the \"first 12 episodes take place within a few months.\"\n\nIn an interview with CBR, Langdale said that he enjoyed Maihack\'s books, and agreed with DreamWorks to create the show, using Maihack\'s characters as a starting point, but then \"went off in some different directions.\" He reiterated that the episodes track the characters on a day-to-day basis, and differed from the original books because Brian and Akila are humans, rather than a cyborg and an alien, with the voice actors shaping their characters. At the same time, he sidestepped the historical debate over her origins.\n\nIn an August 6, 2020 interview with ComicBook, Langdale further explained the development of the show. He said that they didn\'t \"literally translate the books,\" but took the characters Mike Maihack created, some character bits, and did \"our own thing.\" He added that they wanted to show a \"day-to-day story about the characters,\" different from the books, tried to work in \"some ancient Egyptian motifs,\" and said some inspiration could have been drawn from 1970s French comic books, with three divisions in the school: \"command, combat, and science.\" He stated he didn\'t think about the age range DreamWorks gave him for the show, rather aiming to make an enjoyable show which is \"pretty serialized.\"\n\nMusic\nThe music of the series is composed by Jay Vincent and Ryan Lofty. It was described by Courtney Lofty, the score production manager, as \"an epic cocktail of electronic beats, Egyptian melodies, and orchestral dramatics,\" among other melodies, with \"an extreme amount of time\" spent researching the music, which references Paramore, M.I.A., and the score of The Prince of Egypt. The music was attuned to the specific scenes in each episode.\n\nThe series opening theme song was sung by Lilimar Hernandez, the voice actress for Cleopatra. Additionally, Matt Barrios worked on the main title. Jackson Murphy of Animation Scoop described the song as making it clear that Cleo\'s story is about \"meaning, purpose and destiny.\" In an August 13, 2020 interview with Sergio Burstein of the Los Angeles Times, Lilimar described the show as the first thing she had done in the animation field, and said she was surprised by their proposal for her to sing the title song. While she admitted she was nervous to sing the song at first, because she hadn\'t \"done anything with music in ten years,\" she said that working on the show helped her regain her \"voice as a singer,\" while encouraging her to do things out of her \"comfort zone,\" and remained grateful for the positive responses on Twitter. She said something similar in an interview with a Spanish-language publication, Siempre Mujer, and an interview with a Spanish-language YouTuber, saying the project surprised her because she never expected to sing the theme song and that working on the show was a learning experience. On August 21, in an interview with Alfonso Díaz of RCN Nuestra Tele Internacional, Lilimar called working on the show a \"very nice experience,\" noted that the show was her second job for DreamWorks (her first was Spirit Riding Free), and explained how she sometimes recorded the lines for the show alone, while other times she did it with the rest of the cast.
She also explained the struggles with recording lines, how this show is the first time she has had such a big role, and its relevance during the COVID-19 pandemic, with the main cast having to work together under extraordinary circumstances.\n\nIn an interview with CBR in January 2021, she said that singing the opening theme was nowhere in her contract and they called her saying they\'d like her to sing the song. She decided to do so, even though she was \"not in a confident place\" with her singing, and didn\'t think much of it. She wasn\'t told they were going to use it; then they had a premiere for those on the team, and she brought her mom along, who pointed out it was her. She said that the fact that they used her voice meant \"they liked it\" and called it \"really cool\" that she sang the opening.\n\nDesign\nAccording to Langdale, the drawn-out visual development of the show allowed them to have a style close to Maihack\'s original graphic novels. He added that the show\'s crew wanted a visual style which \"was going to be fun to look at,\" with the Academy divided into \"three areas of specialization...identified by color\", which is not directly noted in the show. He further pointed out that one challenge was with the digital 2D animation and that they received help from the animators. In a later interview he said that the show\'s animation sometimes mirrored the scenes in the graphic novels.\n\nCharacter design\nOn September 16, 2020, Bertrand Todesco, the series\' character designer, was interviewed by VoyageLA. In the interview, he described how he imagines the \"shapes and colors of the characters based on the descriptions\" he reads in a script or other document, saying that he draws something differently depending on the age of the audience, and describing the excitement of working collaboratively with others on various animated shows. He also talked about developing the show\'s main characters and how, with the help of DreamWorks and others like Angela Mueller, Art Director for the show, his O-1 visa was approved, allowing him to stay in Los Angeles. According to Langdale, Todesco came up with Akila as a \"fish-based character\" while they went back-and-forth as to whether Brian would be a human or a cyborg, and that they shaped Cleopatra\'s character by what \"someone with a passing familiarity\" with her might think she was as a person.\n\nStorytelling\nIn early January 2021, Wei Li, who storyboarded eight episodes for the show, shared an animatic from an episode he storyboarded, titled \"My Pharaoh Lady,\" adding that the \"show never took off.\" He later explained that what he meant was that the series did not get the \"proper distribution,\" said that he personally thought \"the story could be a lot better,\" and argued that the original comic deals with its subjects more seriously, stating that \"it seems like they radically changed Cleo\'s personality\" from the comics. He later explained that personally, if he could, he would change the episodes \"where Cleo supposedly learns from her mistakes,\" so that the viewers see a \"change in her from that lesson in the episodes afterward\".\n\nVoice acting\nIn an exclusive interview with CBR, Lilimar Hernandez said that this was the first show where she had a major role and that she used her \"natural voice\" when voicing Cleopatra. She described trying to find out what that voice was and attempting to be as consistent as possible with that voice.
She tried her best to keep her \"fun-loving nature\" and called the voice acting a \"cool journey.\" Furthermore, she said that the team she worked with, including Doug Langdale and voice director Sirena Irwin, made her feel excited and comfortable, as she explored the character and the world of the show. In the same interview, she said that the experience was \"nice,\" even as a newcomer to voice acting, adding that working with people who were more skilled in the industry inspired and motivated her. She later said that it was cool \"tapping into the fanbase from the books themselves,\" that she received a lot of \"really, really cool feedback\" and noted that the production schedule was consistent. She described the group sessions as having the highest energy, with everyone having fun \"seeing each other become the characters,\" with none of it seen as draining.\n\nEpisodes\n\nSeries overview\n\nSeason 1 (2020-21)\n\nSeason 2 (2020)\n\nSeason 3 (2021)\n\nRelease\nIn the United States, NBCUniversal\'s advertisement sales website previously suggested that Cleopatra in Space would be broadcast on Universal Kids. Later, it emerged in January 2020 that the series would instead be included in the launch line-up of Peacock, NBCUniversal\'s streaming service, on April 15, 2020 for Xfinity customers, and July 15 for all customers nationwide.\n\nPrior to the scheduled release in the United States, the series premiered on November 25, 2019 on DreamWorks Channel, which is available in Southeast Asia and select other areas of Asia Pacific (specifically, Hong Kong, Indonesia, South Korea, Malaysia, Maldives, Myanmar, the Philippines, Pakistan, Singapore, and Taiwan). Nineteen episodes have since aired on the channel. The series also premiered in Poland on Teletoon+ on February 15, 2020 with a Polish dub. The series was also available in South Africa on Showmax, including all episodes in season one by February 23, 2020. By June 9, 2020, all 26 episodes of the show\'s first season were made available on the Viaplay service in Scandinavia because of an agreement between NBCUniversal and the Nordic Entertainment Group (NENT Group).\n\nOn May 1, 2020, the entire first season of Cleopatra in Space was released on Globoplay, a Brazilian service and subsidiary of Grupo Globo, under the name Cleópatra no Espaço.\n\nOn July 15, when the show premiered on Peacock to all those in the United States, Mike Maihack praised the show\'s release and all the hard work put in, giving it his endorsement. At the same time, he called the release of only 12 episodes \"disappointing,\" and lamented the absence of the sixth episode, \"Quarantine,\" which \"deals with a zombie-like flu and the consequences of Cleo avoiding quarantine,\" saying that it is something the whole world should \"be able to see right now.\" Two days later, when a fan asked about the missing sixth episode and the misspelled title of one of the episodes, an official Peacock account responded, saying they had corrected the episode title, but that for episode 6, \"this content is temporarily unavailable on the platform\" and that they appreciated the feedback, saying they \"will pass it along to the proper team.\" Later the same day, the same account said that there was \"no news on that at the moment.\" A few months later, on September 1, another fan asked about the episode, and an official Peacock account stated that the episode is \"not actively on Peacock\" but gave no further explanation as to why that was the case.
The episode was eventually released on June 25, 2021.\n\nOn August 27, Bertrand Todesco called on Netflix France, and their main office in the United States, to broadcast the show outside the U.S., noting that \"international fans\" were asking about it every day, and that they could negotiate the rights with DreamWorks. The same day, Todesco thanked the fans of the show, saying he had seen \"a lot of incredible\" fan art from \"all over the web.\"\n\nIn September 2020, the show began airing on the Disney Channel in Russia under the name Клеопатра в космосе.\n\nIn October, in the UK, the show began airing on Sky One as part of a partnership with NBCUniversal. In a tweet, Mike Maihack hoped it was \"good news\" for fans of the show in the United Kingdom who had wanted to watch the show. Currently, the Sky One website allows subscribers to their Sky Go service to watch 25 episodes, but not episode 6, \"Quarantine\". When asked about this, Sky UK stated that this was not included because of the \"licensing on the episode.\"\n\nIn November, a new poster for Season 2 was released, as was a summary for the season, saying it would focus on Cleo and her friends \"embarking on a mission to search the galaxy for an ancient artifact that could defeat Octavian for good,\" and a preview video. On November 19, 2020, Season 2 premiered on Peacock.\n\nOn January 9, 2021, in Canada, the series started airing on Family CHRGD. Then on January 14, 2021, Season 3 was released on Peacock.\n\nOn July 14, 2021, the \"Complete First Season\" of the series, along with the Spanish version, Cleopatra en el Espacio, was released on Hulu, consisting of all three seasons which were released on Peacock.\n\nReception\nEncyclopedia of Science Fiction contributor Steven Pearce gave a short positive review of the show, though he was critical of the show\'s writing and Cleopatra\'s personality, saying it is \"very much your feisty American teen.\" The entry praises the \"nice background details\" and calls the series fun, amusing, \"brightly animated and engaging.\" Courtney Lofty described the series as being about \"badass women, talking cats, [and] space,\" noting that the overall vibe is a \"classic Saturday morning cartoon, with extremely quotable moments\" which is like Invader Zim. Cheryl Eddy on Gizmodo described the show as one aimed at children but one that \"looks like a fun ride for geeky grown-ups\", while Karen Han and Petrana Radulovic in Polygon and Sam Stone on CBR reviewed it positively. Additionally, others described the show as \"a fun take on the original Cleopatra story\" and a \"comedic adventure\" which focuses on Cleopatra\'s teenage years, where she is transported into the future \"to an Egyptian-themed planet...ruled by talking cats\" while dealing with the pressures of being \"a teenager in high school\" as she tries to fit in even as Octavian tries to kill her. Later, Petrana Radulovic wrote a positive review of the show. She described the series as wacky and vibrant, using its \"zany concept effectively,\" having interesting adventures, and having main characters with \"typical stock cartoon personalities.\" At the same time, she compared the impulsive and cocky behavior of Cleo to Lance in Voltron: Legendary Defender and Ben in Ben 10.
She further contrasted Cleopatra to Korra in The Legend of Korra and Adora in She-Ra and the Princesses of Power in that she is not ready to accept her destiny but will have to \"confront her own laziness,\" remaining a \"carefree, imperfect heroine\" in the meantime. Radulovic also said that the \"electronic-infused Egyptian melodies of the score\" make it stand out, as do the outfits of the characters, while noting that the show is episodic like Gravity Falls rather than something like She-Ra and the Princesses of Power.\n\nThere were a number of other reviews of the show. In an episode of Tooned Up on the Renegade Pop Culture Podcast, one of the guests described the show as having a stellar voice cast and sharp writing which is \"almost too self aware,\" while saying that they wished that the animation budget \"was a little bit higher.\" The same guest said that the show skews to those \"a little bit younger\" and said that the show takes a \"few episodes to find its stride,\" but once it does that, it is \"one of the easiest shows to binge.\" Another reviewer took a different tack, focusing on themes of libraries in the show, writing in the ALA\'s I Love Libraries that the library at Cleopatra\'s futuristic high school contains information saved from the show\'s villain, who destroyed most of recorded knowledge, and noting that the library\'s section on Ancient Egypt would, if in a real library, \"be housed in a library\'s special collections.\" In contrast, Ashley Moulton on Common Sense Media rated the series 3 out of 5, noting that there is \"a lot of fantasy violence,\" while saying that Cleopatra is a \"fearless female lead,\" with her potential as a role model \"offset by the fact that she can be impulsive, impatient, overconfident, and not so dedicated to her schoolwork,\" adding that there is \"mild language...and flirtation,\" and saying that the show isn\'t educational even though it \"features a historical character.\" Rather it is, in Moulton\'s view, focused on entertainment \"in the vein of \'80s Saturday morning cartoons,\" and she describes the series as \"light, fun tween sci-fi\" animation which explores the past and future, while praising the \"interesting alien species, exciting fight scenes, and fun gadgets like robots and hover boards,\" and the world they live in as \"pretty cool.\" Even so, she argued that the characters are flat even as they \"gleefully engage in moderately violent fight scenes\" to defeat villains, calling the characters \"disappointing\".\n\nExplanatory notes\n\nReferences\n\nExternal links\n\nDepictions of Cleopatra on television\n2020 American television series debuts\n2020s American animated television series\nAmerican children\'s animated action television series\nAmerican children\'s animated space adventure television series\nAmerican children\'s animated comic science fiction television series\nAmerican children\'s animated science fantasy television series\nAmerican flash animated television series\nTelevision series by DreamWorks Animation\nTelevision series by Universal Television\nAnimated television series about teenagers\nPeacock (streaming service) original programming\nTelevision shows based on comics\nPeacock (streaming service) children\'s programming\nWorks about Cleopatra'},\n", + " {'docid': 'doc-en-12',\n", + " 'text': 'Impression Products, Inc. v. Lexmark International, Inc., 581 U.S.
___ (2017), is a decision of the Supreme Court of the United States on the exhaustion doctrine in patent law in which the Court held that after the sale of a patented item, the patent holder cannot sue for patent infringement relating to further use of that item, even when in violation of a contract with a customer or imported from outside the United States. The case concerned a patent infringement lawsuit brought by Lexmark against Impression Products, Inc., which bought used ink cartridges, refilled them, replaced a microchip on the cartridge to circumvent a digital rights management scheme, and then resold them. Lexmark argued that as they own several patents related to the ink cartridges, Impression Products was violating their patent rights. The U.S. Supreme Court, reversing a 2016 decision of the Federal Circuit, held that the exhaustion doctrine prevented Lexmark\\'s patent infringement lawsuit, although Lexmark could enforce restrictions on use or resale of its contracts with direct purchasers under regular contract law (but not as a patent infringement lawsuit). Besides printer and ink manufacturers, the decision of the case could affect the markets of high tech consumer goods and prescription drugs.\\n\\nBackground\\n\\nFactual setting\\n\\nLexmark International, Inc. makes and sells printers and toner cartridges for its printers. Lexmark owns a number of patents that cover its cartridges and their use. Lexmark sold the cartridges at issue in this case—some in the United States and some abroad.\\n\\nDomestic sales \\n\\nLexmark\\'s domestic sales were in two categories. A \"Regular Cartridge\" is sold at \"list price\" and confers an absolute title and property right on the buyer. A \"Return Program Cartridge\" is sold at a discount of about 20 percent, and is subject to post-sale restrictions: The buyer may not reuse the cartridge after the toner runs out and may not transfer it to anybody else. The first branch of the case turns on the legal status of these post-sale restrictions.\\n\\nLexmark manufactured the toner cartridges with microchips in them, which send signals to the printers indicating toner level. When the amount of toner in a cartridge falls below a certain level, the printer will not operate with that cartridge. Also, the printer will not operate with a Return Program Cartridge that has been refilled by a third party. Thus, Lexmark\\'s technology prevented violation of the post-sale restriction against refilling the Return Program Cartridges. The Regular Cartridges do not have this anti-refill feature and can therefore be refilled and reused (but they cost 20 percent more).\\n\\n\"To circumvent this technological measure,\" however, \"third parties have \\'hacked\\' the Lexmark microchips. They created their own \"unauthorized replacement\" microchips that, when installed in a Return Program cartridge, fool the printer into allowing reuse of that cartridge. Various companies purchase used Return Program Cartridges from the customers who bought them from Lexmark. They replace the microchips with \"unauthorized replacement\" microchips, refill the cartridges with toner, and sell the \"re-manufactured\" cartridges to resellers such as Impression Products for marketing to consumers for use with Lexmark printers. Lexmark had previously argued in Lexmark International, Inc. v. Static Control Components, Inc. 
that replacing these microchips violated copyright law and the Digital Millennium Copyright Act (DMCA), but both the lower federal courts and the Supreme Court ruled against Lexmark, affirming that replacing the microchips is not a violation of copyright.\n\nImported cartridges\n\nThe second branch of the case involves cartridges that Lexmark sold outside the US. While some of the foreign-sold cartridges were Regular Cartridges and some were Return Program Cartridges, this branch of the case does not involve any distinction between the two types of imported cartridges.\n\nTrial court decision\n\nThe district court granted Impression\'s motion to dismiss Lexmark\'s claim of infringement involving the single-use cartridges Lexmark had first sold in the United States. The district court concluded that the Supreme Court in Quanta Computer, Inc. v. LG Electronics, Inc. found exhaustion where \"the Supreme Court determined that the agreements [at issue] broadly authorized Intel [the seller] to sell the licensed products without restrictions or conditions.\" The district court said \"that Quanta overruled Mallinckrodt sub silentio,\" and therefore \"those post-sale use restrictions do not prevent patent rights from being exhausted given that the initial sales were authorized and unrestricted.\"\n\nThe district court held, however, that the exhaustion doctrine did not apply to the cartridges that Lexmark had sold abroad. It said that international exhaustion did not apply to patents because Kirtsaeng v. John Wiley & Sons, Inc., which established international exhaustion in at least some cases, applied only to copyrights. The court therefore denied Impression\'s motion to dismiss Lexmark\'s claim of infringement involving the cartridges Lexmark had sold abroad.\n\nGovernment amicus curiae position\n\nIn its amicus curiae brief, the US Government argued that Mallinckrodt had been wrongly decided in 1992 and in any case it had been overruled sub silentio in Quanta. It stated:\nIn the view of the United States, the first authorized sale of a patented article in the United States wholly exhausts the patentee\'s exclusive rights in that article, notwithstanding any post-sale restriction imposed by the patentee.\nThe government also argued that the decision of Jazz Photo Corp. v. United States International Trade Commission (2001) should be partially overruled in light of Kirtsaeng insofar as it held that foreign sales can never exhaust US patent rights. When the patentee neither makes nor authorizes a foreign sale, as occurred in Boesch v. Graff, it is proper to say no exhaustion occurred. But when the patentee makes or authorizes a foreign sale, and fails expressly to reserve its US rights, then exhaustion should be found. In the present case, Lexmark made the foreign sales and failed to expressly reserve its US rights; therefore, the sale exhausted the patent rights.\n\nFederal Circuit decision\n\nThe parties each appealed. After a three-judge panel had heard oral argument, the Federal Circuit sua sponte set the case for argument en banc in the first instance and invited the filing of amicus curiae briefs.\n\nMajority opinion\n\nJudge Taranto, writing for a 10-2 majority, reaffirmed both of the prior Federal Circuit rulings. In summary, the court held:\n\nFirst, we adhere to the holding of Mallinckrodt, Inc. v. Medipart, Inc.
that a patentee, when selling a patented article subject to a single-use/no-resale restriction that is lawful and clearly communicated to the purchaser, does not by that sale give the buyer, or downstream buyers, the resale/reuse authority that has been expressly denied. Such resale or reuse, when contrary to the known, lawful limits on the authority conferred at the time of the original sale, remains unauthorized and therefore remains infringing conduct under the terms of §\xa0271. Under Supreme Court precedent, a patentee may preserve its §\xa0271 rights through such restrictions when licensing others to make and sell patented articles; Mallinckrodt held that there is no sound legal basis for denying the same ability to the patentee that makes and sells the articles itself. We find Mallinckrodt\'s principle to remain sound after the Supreme Court\'s decision in Quanta Computer, Inc. v. LG Electronics, Inc.\xa0.\xa0.\xa0.\n\nSecond, we adhere to the holding of Jazz Photo Corp. v. International Trade Comm\'n, that a U.S. patentee, merely by selling or authorizing the sale of a U.S.-patented article abroad, does not authorize the buyer to import the article and sell and use it in the United States, which are infringing acts in the absence of patentee-conferred authority. Jazz Photo\'s no-exhaustion ruling recognizes that foreign markets under foreign sovereign control are not equivalent to the U.S. markets under U.S. control in which a U.S. patentee\'s sale presumptively exhausts its rights in the article sold. A buyer may still rely on a foreign sale as a defense to infringement, but only by establishing an express or implied license—a defense separate from exhaustion, as Quanta holds—based on patentee communications or other circumstances of the sale. We conclude that Jazz Photo\'s no-exhaustion principle remains sound after the Supreme Court\'s decision in Kirtsaeng v. John Wiley & Sons, Inc., in which the Court did not address patent law or whether a foreign sale should be viewed as conferring authority to engage in otherwise-infringing domestic acts. Kirtsaeng is a copyright case holding that 17 U.S.C. §\xa0109(a) entitles owners of copyrighted articles to take certain acts \"without the authority\" of the copyright holder. There is no counterpart to that provision in the Patent Act, under which a foreign sale is properly treated as neither conclusively nor even presumptively exhausting the U.S. patentee\'s rights in the United States.\n\nDomestic exhaustion\n\nIn this part of its opinion, the Federal Circuit reaffirmed its Mallinckrodt decision and rejected contentions that Quanta had silently overruled it.\n\n§\xa0271 abrogates common-law rule\n\nThe court began by distinguishing the Patent Act\'s and Copyright Act\'s respective approaches to infringement. In 17 U.S.C. §\xa0109(a), the Copyright Act says, \"Notwithstanding the provisions of section 106(3),\" defining infringement by selling, a purchaser \"is entitled, without the authority of the copyright owner, to sell or otherwise dispose of the possession\" of a purchased copy of a work. In contrast, the Patent Act contains no exhaustion provision. Therefore, the Patent Act requires a \"conferral of \'authority\' by the patentee .\xa0.\xa0. in order for the actions listed in §\xa0271(a) not to constitute infringement.\" This means there must be \"permission from the patentee\" to avoid infringement. The court does not accept exhaustion as a form of \"constructive\" permission.
Hence, if the patentee places explicit limits or conditions on its permission, they qualify the scope of the permission. This has the effect of limiting the common law.\n\nGeneral Talking Pictures rule applies to \"conditional\" sale\n\nThe court turned to the General Talking Pictures decision, which holds \"that Lexmark would not have exhausted its patent rights in those cartridges, upon the manufacturing licensee\'s sale (the first sale), if a buyer with knowledge of the restrictions resold or reused them in violation of the restrictions.\" Although the government in its amicus curiae brief and defendant Impression argue \"that a different result is required—that Lexmark automatically lost its patent rights—simply because Lexmark sold the Return Program cartridges itself, subject to the same communicated restriction, rather than having left the manufacture and sale to others under license,\" the court does not accept that:\n\nWe conclude otherwise, as we did in Mallinckrodt and subsequent decisions. A sale made under a clearly communicated, otherwise-lawful restriction as to post-sale use or resale does not confer on the buyer and a subsequent purchaser the \"authority\" to engage in the use or resale that the restriction precludes. And there is no sound reason, and no Supreme Court precedent, requiring a distinction that gives less control to a practicing-entity patentee that makes and sells its own product than to a non-practicing-entity patentee that licenses others to make and sell the product.\n\nQuanta distinguishable and inapplicable\n\nThe court turned to the Quanta decision and found it inapplicable to the present issues. \"Quanta did not involve a patentee\'s sale at all, let alone one subject to a restriction or, more particularly, a single-use/no-resale restriction.\" Rather, Quanta involved a patentee\'s (LGE\'s) license to a manufacturer (Intel) that sold to the accused infringer (Quanta). LGE had not limited Intel\'s license to manufacture the patented product, although it imposed contractual obligations on Intel. \"No conditions limited Intel\'s authority to sell products substantially embodying the patents.\" The Federal Circuit emphasized: \"There were no patentee sales, and there were no restrictions on the sales made by the licensee.\" Those facts were far removed from the case at bar. Thus the Quanta \"Court\'s discussion of that issue does not undermine Mallinckrodt\'s ruling that a patentee can preserve its patent rights through restrictions on its sales.\" The Federal Circuit also emphasized as significant the failure of the Quanta Court to explicitly repudiate Mallinckrodt despite the fact that in its amicus brief \"the government prominently featured an argument that Mallinckrodt was incorrect and should be repudiated.\"\n\nPrior cases\n\nThe court then turned to the prior Supreme Court cases. Reviewing them, it found that although they used sweeping language that a patentee\'s sale of the patented product placed it beyond the reach of the patent, so that no post-sale restriction could be enforced under the patent laws, that language went beyond the actual facts of the cases. First, the sales were in most cases without any condition or restriction on what the buyer might do with the product.
Second, in the cases where an explicit condition or restriction was imposed, the case involved a tie-in or a price-fix.\\n\\nThe Court conceded that in the General Electric case, the Supreme Court had said: \"It is well settled, as already said, that where a patentee makes the patented article, and sells it, he can exercise no future control over what the purchaser may wish to do with the article after his purchase. It has passed beyond the scope of the patentee\\'s rights.\" But that case involved an antitrust challenge to GE\\'s distribution of lamps that did not meet that description. The case involved price restrictions on a licensed manufacturer. The Federal Circuit then explained that the word \"settled\" in the Supreme Court\\'s statement had a special, narrow meaning:\\n\"We read that language to deem \\'settled\\' only what was settled in the cited precedents—a patentee\\'s sales without restrictions exhaust patent rights in the item sold.\" Thus, the Supreme Court\\'s sweeping exhaustion language applies precedentially only to cases in which either the sale was without condition or restriction or else the sale was made with a tie-in or price-fixing condition. \"But the Court did not rule that all restrictions on a patentee\\'s sale were ineffective to preserve the patentee\\'s patent-law rights.\"\\n\\nSimilarly, in United States v. Univis Lens Co., the Supreme Court\\'s sweeping language must now be limited to the factual context of the case:\\n\\nMoreover, although some language in Univis, like language in other decisions in the area, can be taken out of context and read as going beyond the specific restrictions involved, the most the Court ruled, even as to patent law all by itself, was that a vertical price-control restriction was ineffective to preserve patent rights after sales of articles embodying the patents. While Univis is controlling on what it decided on the issues before it, we do not think it appropriate to give broad effect to language in Univis, taken out of context, to support an otherwise-unjustified conclusion here on a question not faced there.\\n\\nThe Federal Circuit therefore drew this conclusion from the past series of Supreme Court cases on exhaustion:\\n\\nFor the foregoing reasons, we think that the best lesson to draw from the Supreme Court\\'s precedents, as applied to the question before us, is that a patentee may preserve its patent rights by otherwise-proper restrictions when it makes and sells patented articles itself and not only when it contracts out manufacturing and sales.\\n\\nPatent law trumps common law\\n\\nThe Federal Circuit returned to the common law and Lord Coke\\'s commentary on it. Again, the court insisted that Congress had overridden the common law\\'s prohibitions on post-sale restraints, in order to promote technological progress:\\n\\n[W]hatever considerations might go into a jurisdiction\\'s choice as to the background rule for personal property in general, lawmaking authorities may reasonably make different choices for particular kinds of property. Notably, as to intellectual property in its various forms, Congress, implementing the Constitution, has long deemed it important to incentivize creation and disclosure through grants to the creator of rights to exclude others for a time.\\xa0.\\xa0.\\xa0. That overriding legislative prescription removes the patented-article sale from the scope of Lord Coke\\'s 1628 description of his country\\'s general judicially fashioned property law.\\xa0.\\xa0.\\xa0. 
In short, notwithstanding Lord Coke\\'s description of English general personal-property judge-made law, the patent-specific statutory analysis must govern here.\\n\\nLikely effects on public\\n\\nThe court then turned to what it called \"the likely real-world consequences of one answer or another to the exhaustion question presented here.\" The court noted that in Kirtsaeng the Supreme Court had envisioned serious adverse effects on competition unless Coke\\'s 1628 property law rules were followed. The Federal Circuit said that did not apply to patents:\\n\\n[W]e see no basis for predicting the extreme, lop-sided impacts the Court found plausible in Kirtsaeng in different circumstances. Mallinckrodt has been the governing case law since 1992 and has been reiterated in subsequent precedent. And yet we have been given no reliable demonstration of widespread problems not being solved in the marketplace. Given General Talking Pictures, the only question is about patentees\\' ability to do for their own sales what they already can do by contracting out their manufacturing and sales. Regarding the specific scenario we are addressing today—in which the patentee has sought to preserve its patent rights by conditioning its first sale on a single-use/no-resale restriction of which the accused infringer had adequate notice at the time of purchase—we have been given no proof of a significant problem with enforcing patent rights.\\n\\nFurthermore, the Federal Circuit maintained, the conduct challenged here can have benefits. Under Lexmark\\'s program, customers who agree to the restriction pay a lower price than those who do not. It could be that the companies that refill the cartridges use inferior products that could harm the Lexmark machines, which \"could harm Lexmark\\'s reputation.\" To assume that the restrictions are illegitimate would run counter to the trends \"over the last four decades, that have displaced the strict condemnation of various vertical restrictions that characterized\" earlier antitrust and patent-misuse law in the first part of the twentieth century. \"Field-of-use, territorial, and other limitations on intellectual property licenses may serve procompetitive ends by allowing the licensor to exploit its property as efficiently and effectively as possible.\" Therefore, the court concluded it is appropriate to apply to post-sale restrictions the same tolerance that the General Talking Pictures doctrine accords limitations in manufacturing licenses.\\n\\nInternational exhaustion\\n\\nIn this part of its opinion, the Federal Circuit reaffirmed its Jazz Photo opinion and rejected contentions that Kirtsaeng had undermined the basis for Jazz Photo. The Federal Circuit insisted that \"Kirtsaeng says nothing about patent law.\"\\n\\nThe court emphasized the differences between patent law and copyright law. For example, patent law gives patentees an exclusive right to use of the invention but copyright law gives no general exclusionary right as to use (it gives exclusive public performance and display \"use\" rights, but not others). Also, it is much more costly and time-consuming to obtain a patent than a copyright. 
The court did not explain, however, the way that these or other differences between copyrights and patents called for contrary results as to international exhaustion.\n\nThe court did say that the US patent statute gives patentees the reward available from \"sales in American markets, not from sales in foreign markets.\" A sale in a foreign market therefore does not furnish a proper basis for finding exhaustion. \"American markets differ substantially from markets in many other countries, and not just because of disparities in wealth that can lead to dramatically different prices\" in this country and abroad (as was the case in Kirtsaeng). \"Government policies differ dramatically, including policies on price regulation and, most particularly, policies on the availability and scope of patent protection.\" The court did not explain further, however, whether and how such dramatic differences in policy applied to the toner cartridges at issue in the present case.\n\nThe court then turned to the only Supreme Court case on foreign exhaustion, Boesch v. Graff. In that case, Graff was the assignee of a US patent. Boesch bought the product from a German supplier who had a prior-user right under German law to make and sell the product, because the supplier had begun activity before the application for the German counterpart patent was filed. The US assignee and the inventor had no connection with Boesch. When Boesch imported the product into the US, Graff sued for infringement. The US courts found Boesch liable. The rights that Boesch had under German law did not entitle him to import the product into the US. That is governed by US law. The US patentee had never \"received any royalty or given any license to use the patented article in any part of the United States.\"\nAccordingly, the court held, a foreign sale does not of its own force authorize importation into the US.\n\nThis does not mean, however, that a patentee by its conduct cannot waive its US rights, be estopped from asserting them, or be found to have granted an implied license.\n\nThe court expressed concern that overruling Jazz Photo would harm the US drug industry:\n\nThere seems to be no dispute that U.S.-patented medicines are often sold outside the United States at substantially lower prices than those charged here and, also, that the practice could be disrupted by the increased arbitrage opportunities that would come from deeming U.S. rights eliminated by a foreign sale made or authorized by the U.S. patentee.\n\nFinally, the court rejected a proposal that exhaustion should be presumed unless the patentee expressly states that it reserves its US rights. Foreign governments might \"prohibit sellers from stating reservations of rights that would make importation into and sale in the United States more difficult.\" Also: \"Intermediary companies between the foreign purchase and the importation into the United States may be created that make it difficult for the U.S. patentee to carry an affirmative burden of proving adequate notice of reservations attached to a foreign-sold article.\"\n\nDissenting opinion\n\nJudge Dyk, joined by Judge Hughes, dissented from both branches of the court\'s exhaustion analysis. Judge Dyk summarized his dissent in these terms:\n\nI would overrule our decision in Mallinckrodt as inconsistent with governing Supreme Court authority and overrule Jazz Photo to the extent that it imposes a blanket ban on foreign exhaustion. I would recognize foreign exhaustion where the U.S.
rights holder has not notified the buyer of its retention of the U.S. patent rights.\n\nDomestic exhaustion\n\nIn this part of the dissent, Judge Dyk argued that the majority had misunderstood the Supreme Court\'s exhaustion jurisprudence in order to substitute its own ideas of the proper balance between patent rights and public rights. He began by saying:\n\nFirst, I agree with the government that Mallinckrodt was wrong when decided, and in any event cannot be reconciled with the Supreme Court\'s recent decision in Quanta Computer, Inc. v. LG Electronics, Inc. We exceed our role as a subordinate court by declining to follow the explicit domestic exhaustion rule announced by the Supreme Court.\n\nHe argued that since 1850 the Supreme Court has held that a sale by the patentee or its licensee exhausts all patent rights. In such cases, \"The question of whether the seller has \'authorized\' the buyer to use or resell the item is simply irrelevant.\" Post-sale restrictions could not be enforced under federal patent law. The only Supreme Court case to depart from that principle was Henry v. A.B. Dick Co., and it was explicitly overruled five years later by Motion Picture Patents Co. v. Universal Film Mfg. Co. The principle of the overruled Dick case that a patentee could impose a post-sale restriction by giving a buyer notice of it was \"the same as the panel\'s holding in Mallinckrodt and the majority\'s holding in this case.\"\n\nHe insisted that the majority opinion misread the Motion Picture Patents decision by asserting \"that it only \'held particular restrictions improper\' .\xa0.\xa0. but \'did not rule that all restrictions on a patentee\'s sale were ineffective to preserve the patentee\'s patent-law rights.\'\" He explained:\n\nThat is not accurate. Motion Picture Patents did not leave behind the remnants of A.B. Dick—minus tie-ins and resale price maintenance. To the contrary, the Court in Motion Picture Patents found that \"[t]he patent law furnishes no warrant for\" the restrictions imposed by the patent owner.\n\nLater cases, such as Quanta, confirmed this \"broad patent exhaustion rule [in Motion Picture Patents] and left no room for a resurrection of A.B. Dick.\"\n\nHe next turned to the majority\'s references to \"conditional sales\" and \"unconditional sales,\" and said that the majority misconstrued the terms. \"Conditional sales,\" he said, as used in pre-Mallinckrodt case law referred only to the retention of title for a security interest in installment purchases. \"In other words, a sale with restrictions could nonetheless be an \'unconditional\' sale in which title passes, with the restrictions invalid under the patent laws because of exhaustion.\"\n\nHe then criticized the majority for making up special rules for patent cases that differed from the common law and general legal principles, citing Supreme Court admonitions not to do that: \"The Supreme Court has repeatedly instructed us not to ignore traditional legal principles to fashion rules \'unique to patent disputes.\'\"\n\nFinally, Judge Dyk took issue on multiple grounds with the majority\'s efforts to distinguish and limit the Supreme Court\'s rulings.
\"The majority\\'s justifications for refusing to follow Supreme Court authority establishing the exhaustion rule misconceive our role as a subordinate court.\" Each justification in the majority decision was unsupportable, he said.\\n\\n \"First, the majority characterizes the statement of the exhaustion rule in the Supreme Court cases as mere dictum because in those cases there was either no restriction imposed or the restriction would otherwise violate the antitrust laws. But the cases impose no such qualification on the rule announced. The Supreme Court has repeatedly advised the courts of appeals that our task is to follow the rules proclaimed by the Court, and not to attempt to distinguish Supreme Court cases on their facts.\"\\n \"Second, the majority relies on 35 U.S.C. §§\\xa0271(a) and 154(a)(1) to suggest that a broad reading of the exhaustion doctrine is inconsistent with statutory language making an act of infringement .\\xa0.\\xa0. any use or sale of a patented invention \\'without authority\\' of the patent owner, and providing the patent owner with a \\'right to exclude.\\'\" But the patent exhaustion doctrine is a limitation on the operation of those sections, and applies notwithstanding them.\\n \"Third, the majority claims that giving full sweep to the articulation of the exhaustion doctrine in Quanta and other cases would be inconsistent with the Supreme Court\\'s decision in General Talking Pictures Corp. v. Western Electric Co. .\\xa0.\\xa0. The majority suggests it would be incongruous if \\'a patentee cannot preserve its patent rights against uses of a patented article .\\xa0.\\xa0. if, instead of licensing someone else to make and sell the article, it chooses to make and sell the article itself.\\'\\xa0\"\\n\\nBut General Talking Pictures was a case of a license to manufacture in a limited field, not a sale with a post-sale restriction. The cases recognize that distinction. Thus, in Quanta the Supreme Court stated that General Talking Pictures \"held that exhaustion did not apply because the manufacturer had no authority to sell the amplifiers for commercial use.\" And where the manufacturer in that case (Intel) did have a general authority to make and sell, the Supreme Court held that exhaustion applied to the sale.\\n\\nThe majority found \"tension\" between \"the Supreme Court\\'s broad statement of the exhaustion rule and General Talking Pictures\" and sought to resolve it by extending the rule of General Talking Pictures and contracting the exhaustion doctrine in the area of possible conflict. But, Dyk maintained:\\n\\n[I]t is not our task to ignore Supreme Court rulings as \"unjustifi[ed]\" or \"unsound\" because they are purportedly inconsistent with other Supreme Court cases. The distinction between restrictions on sales (impermissible) and restrictions on licensees (permissible) exists in the Court\\'s precedent, and it is not for us to decide if it is a sound distinction.\\n\\n \"Finally, the majority proposes that we should somehow sustain the restriction here because it may be pro-competitive. 
Exhaustion does not turn on whether a particular post-sale restriction is desirable or undesirable, pro-competitive or anti-competitive, but whether the sale was authorized and the item has passed beyond the scope of the patent monopoly.\" Furthermore, the Supreme Court said in Kirtsaeng that a prohibition on resale is \"manifestly anti-competitive.\"\n\nDyk concluded his discussion of domestic exhaustion with the statement: \"There is, in sum, no colorable basis for the majority\'s failure to follow the exhaustion rule for domestic sales as articulated by the Court in Quanta and numerous other cases.\"\n\nInternational exhaustion\n\nIn this part of the dissent, Judge Dyk argued for a nuanced balance that called for different results depending on whether the patentee was responsible for the sale abroad that was alleged to trigger exhaustion.\n\nHe began by pointing out that because Lexmark\'s foreign sales were made without any restrictions or reservations, \"even under the majority\'s cramped view of exhaustion, there is no question that the sales would have exhausted Lexmark\'s domestic patent rights. The issue is whether the foreign location of the sale should lead to a different result, as we previously held in Jazz Photo.\"\n\nHe then turned to \"the centerpiece of the majority\'s holding that there is a doctrinal blanket ban on foreign exhaustion, namely the Supreme Court\'s decision in Boesch v. Graff.\" But \"Boesch announced no such blanket ban,\" he said. \"It did not even involve an authorized sale by the holder of U.S. patent rights but rather a sale by a third party under a foreign law\'s prior use exception.\" But \"Boesch does not apply here because the foreign sales were made by Lexmark.\"\n\nIn every US lower court decision before Jazz Photo: \"When the sale was made by an entity not holding U.S. patent rights, as in Boesch, or when the authorized foreign seller clearly reserved U.S. rights, there was no exhaustion.\" In contrast, \"where the foreign sale was made by a seller holding U.S. patent rights without a contractual reservation of U.S. rights, exhaustion occurred as a result of an authorized foreign sale.\"\n\nDyk maintained that \"Kirtsaeng provides significant guidance and cannot be dismissed as simply a copyright case, or as limited to the \'first sale\' provision of the Copyright Act.\" Rather, the policies that animated Kirtsaeng typically apply to patent exhaustion. But because in some cases a difference may be significant, there should be a balanced approach. Dyk argued for \"put[ting] the burden on the U.S. rights holder to provide notice of a reservation of U.S. rights to the purchaser.\" Thus, he \"would recognize foreign exhaustion where the U.S. rights holder has not notified the buyer of its retention of the U.S. patent rights.\"\n\nSupreme Court\nIn March 2016, Impression filed a petition for certiorari in the U.S. Supreme Court. Impression presented these questions in its petition:\n\n\xa0\xa0\xa01. Whether a \"conditional sale\" that transfers title to the patented item while specifying post-sale restrictions on the article\'s use or resale avoids application of the patent exhaustion doctrine and therefore permits the enforcement of such post-sale restrictions through the patent law\'s infringement remedy.\n\xa0\xa0\xa02. Whether, in light of this Court\'s holding in Kirtsaeng v. John Wiley & Sons, Inc., 133 S. Ct.
1351, 1363 (2013), that the common law doctrine barring restraints on alienation that is the basis of exhaustion doctrine \"makes no geographical distinctions,\" a sale of a patented article—authorized by the U.S. patentee—that takes place outside of the United States exhausts the U.S. patent rights in that article.\\n\\nOn June 20, 2016, the Court invited the Solicitor General to file briefs in this case expressing the views of the United States. In October 2016, the government filed the requested amicus curiae brief. It recommended grant of certiorari on both questions. The brief argues that the \"Federal Circuit\\'s decision misreads\" the Supreme Court\\'s precedents and \"would substantially erode the exhaustion doctrine.\" The Supreme Court granted certiorari on December 2, 2016 and heard oral argument in the case on March 21, 2017. The Court published its decision on May 30, 2017.\\n\\nMajority\\nA unanimous Court found that Lexmark exhausted its patent rights upon first sale domestically, even with the single-use/no-resale restrictions imposed by Lexmark in contracts with its customers, although such restrictions could be enforced under contract law. The Court noted that the exhaustion doctrine has a long history and that any change would have significant effects on commerce in the modern world, noting that \"extending the patent rights beyond the first sale would clog the channels of commerce, with little benefit from the extra control that the patentees retain,\" and that complex modern supply chains can involve large numbers of patents. Chief Justice Roberts, in his opinion, compared the situation to automobile repair shops: \"The business works because the shop can rest assured that, so long as those bringing in the cars own them, the shop is free to repair and resell those vehicles. That smooth flow of commerce would sputter if companies that make the thousands of parts that go into a vehicle could keep their patent rights after the first sale.\"\\n\\nSeven justices joined the Court\\'s opinion extending that reasoning to items imported from abroad. Lexmark had argued, and the Federal Circuit agreed, that sale abroad \"does not trigger patent exhaustion unless the patentee \\'expressly or implicitly transfers or licenses\\' its rights.\" The Court, however, ruled that \"[a]n authorized sale outside the United States, just as one within the United States, exhausts all rights under the Patent Act.\" The Court relied on its 2013 decision in Kirtsaeng v. John Wiley & Sons, Inc. on a nearly identical issue under copyright law. Because the underlying statute was not clear as to its geographical scope, the Court in Kirtsaeng decided that, since the statute was based in the common law exhaustion doctrine, which is not limited in geographic extent, the statute at issue was therefore not intended to be limited to only U.S. sales. Applying the same principle to patent law, which historically has a close connection with copyright law, was \"straightforward\" and \"the bond between [copyright and patent law] leaves no room for a rift on the question of international exhaustion\".\\n\\nPartial dissent\\nJustice Ginsburg dissented from the Court\\'s holding with respect to imported items. Adhering to substantially the same reasoning of her dissent in Kirtsaeng, Justice Ginsburg argued that because patent law is territorial and the sale of an item abroad is \"independent[] of the U.S. patent system, it makes little sense to say that such a sale exhausts an inventor\\'s U.S. 
patent rights.\" She would have upheld the Federal Circuit\\'s decision that sale abroad does not exhaust a patentee\\'s rights in the United States.\\n\\nCommentary\\n\\nGerstein\\n\\nRobert M. Gerstein concluded that further review in the Supreme Court was likely:\\nGiven the Supreme Court\\'s interest in patent cases, a vigorous dissent in Lexmark that relies on a number of Supreme Court precedents, including Quanta and Kirtsaeng, and the position of the Justice Department that Quanta overruled Mallinckrodt, it would not be surprising to see the Supreme Court take up Lexmark in its next term.\\n\\nDodd and Dowd\\n\\nJeff C. Dodd and Matthew J. Dowd viewed the decision as an affirmation of strong patent rights:\\nLexmark embraces a very strong view of patent rights and a narrow view of the scope of exhaustion. It affirms that patent holders have wide latitude to segment and control distribution in the market channels for products covered by patents. This latitude is particularly wide with respect to limiting the import into the United States of patented goods sold in authorized sales in foreign markets even where restrictions on resale were not proven to have been communicated to foreign buyers. Even so, the court left open the possibility that foreign sales, under the right circumstances, may incorporate an implied license to import and use the product within the United States.\\n\\nCukierski and Masia\\n\\nKevin J. Cukierski and Adam H. Masia see the decision as \"pro-patent owner\" but warn again premature celebration:\\nBut take caution—it is likely that the Supreme Court will be asked to hear the case. Given the tension between this case and the Supreme Court\\'s language in Quanta and Kirtsaeng, along with the discord at the district court level and among commentators before the Federal Circuit\\'s decision, there\\'s a good chance the Supreme Court will do so. Until the Supreme Court has its say, you should take precautions in case the Supreme Court takes an expansive view of patent exhaustion and decides to remove these exceptions.\\n\\n\"Without Precedent\"\\n\\nAnother commentator (unsigned comment) indicated a skeptical view of the Federal Circuit\\'s tendency to march to a different drummer. After quoting Judge Dyk\\'s admonition, \"We exceed our role as a subordinate court by declining to follow the explicit domestic exhaustion rule announced by the Supreme Court,\" he (or she) observed:\\nFor present purposes, it is simply worth noting that the Federal Circuit appears to be inching closer again to the concept that patent law is simply a unique beast, with unique rules and requirements. The Supreme Court has taken a skeptical view of that approach in the past. 
And may well again.\\n\\nJahn, Pichler, and Lo\\n\\nPaul Jahn, Rufus Pichler and Lincoln Lo raise many questions (mostly about \"clear communication\") about what the Lexmark majority opinion left unresolved:\\n\\n Conflict or tension with Quanta: \"Quanta expressly distinguished implied licenses and exhaustion, holding that disclaimers of license rights are \\'irrelevant\\' where \\'the right to practice the patents is based not on implied license but on exhaustion.\\'\\xa0\" But \"the Federal Circuit appears to treat exhaustion like an implied license—one that the patentee can disclaim by \\'clearly communicate[d]\\' restrictions.\" Quanta appears to hold that the patentee\\'s attempt to impose a post-sale restriction on a manufacturing licensee is ineffective if the license does not conform to the General Talking Pictures case.\\n \"[W]hat arrangement between a seller and buyer is sufficient to deny \\'authority.\\'? It was undisputed in Lexmark that there was \\'an express and enforceable contractual agreement\\' between Lexmark and each end-user, and that the no-resale and no-reuse restrictions were binding on end users. Yet throughout the Lexmark opinion, the majority suggests that restrictions may be sufficient if \\'clearly communicated\\'—even if well short of a contractual meeting of the minds.\"\\n Another way to put this is what is a \"clear communication\"? In Jazz Photo, the Federal Circuit noted that the \"package instructions [were] not in the form of a contractual agreement by the purchaser to limit reuse of the cameras.\" Accordingly, \"There was no showing of a \\'meeting of the minds\\' whereby the purchaser, and those obtaining the purchaser\\'s discarded camera, may be deemed to have breached a contract or violated a license limited to a single use of the camera.\" The writers conclude, therefore, \"It is unclear if the Federal Circuit intended an expansion of the patentee-seller\\'s ability to avoid exhaustion.\"\\n Also, how clear must a \"clear communication\" be? \"The Federal Circuit appears to limit infringement claims against subsequent downstream buyers to those \\'having knowledge of the restrictions.\\' The appellate court did not elaborate on what defenses a subsequent downstream purchaser without knowledge may have, assuming no exhaustion. The court only mentions in passing that \\'we do not have before us the questions that would arise, whether under principles governing bona fide purchasers or otherwise, if a downstream re-purchaser acquired a patented article with less than actual knowledge of the restriction.\\'\\xa0\"\\n Finally, does the court\\'s focus on \"clear communication\" have a negative impact on post-sale restrictions that a limited licensee under General Talking Pictures is required to impose? \"The Federal Circuit suggested repeatedly that buyers\\' knowledge of the licensee\\'s field of use limitation may be required for a licensee\\'s sale to be non-exhaustive. While General Talking Pictures did not clearly resolve this question, many licensors have assumed that sales by a licensee outside of its licensed field are unauthorized altogether and are therefore non-exhaustive regardless of the purchaser\\'s knowledge of the field of use limitation.\" Therefore, does the emphasis, here \"on the buyer\\'s knowledge, even if dicta, add to the uncertainty concerning this issue\"?\\n\\nCastanias, Nix, and Kazhdan\\n\\nGregory A. Castanias, Kelsey I. 
Nix, and Daniel Kazhdan also point to unresolved issues over which patent owners \"must still be cautious\":\\nLexmark explicitly left open several fact-specific questions, including (i) what happens if someone acquires a patented article with \"less than actual knowledge\" of the restrictions placed on the original sale by the patent owner and (ii) when would a foreign buyer have an \"implied license\" to sell in the United States, independent of patent exhaustion. These issues will surely be raised in future cases.\\n\\nCrouch\\n\\nDennis Crouch, in Patently-O commented on the issues and provided a summary of the merits briefs filed in the Supreme Court as of January 31, 2017. Crouch opposed the Federal Circuit\\'s ruling on these grounds:\\nWith personal property courts long ago rejected servitudes (such as use and resale restrictions) that bind subsequent purchasers. Unlike real property, personal property moves and is often transferred without substantial paperwork or record-keeping, and allowing a set of unique restrictions has the potential of gumming up the marketplace. The Federal Circuit in this case went all the way to the other side — holding that the presumption in foreign sales is that no US patent rights are exhausted. I purchased my last couple of smart phones through the used market – and have also repaired them several times. Under the law, I probably should have taken steps to ensure that all of the original equipment manufacturers affirmatively granted repair and resale rights. Coming together, the Federal Circuit\\'s approach here has the potential to limit the market for the repair and reselling of goods. I would suggest that those activities are incredibly beneficial to our society in terms of resource allocation and avoiding waste as well as empowering citizens and avoiding anticompetitive market behavior.\\n\\nNotes and references\\n\\nNotes\\n\\nReferences\\n\\nExternal links\\n \\n SCOTUSblog coverage\\n Podcast – Interview with proprietor of Impression Products\\n\\nIntellectual property law\\nUnited States patent case law\\nUnited States Court of Appeals for the Federal Circuit cases\\nUnited States Supreme Court cases\\nUnited States Supreme Court cases of the Roberts Court\\n2015 in United States case law\\n2017 in United States case law\\nLexmark'},\n", + " {'docid': 'doc-en-13',\n", + " 'text': 'The Werewolf by Night is the name applied to two fictional characters who are werewolves appearing in American comic books published by Marvel Comics. The Werewolf by Night (usually referred to by other characters simply as the Werewolf) first appeared in Marvel Spotlight #2 (February 1972).\\n\\nPublication history\\nPrior to the formation of the Comics Code Authority in 1954, Marvel\\'s predecessor Atlas Comics published a five-page short story titled \"Werewolf by Night!\" in Marvel Tales #116 (July 1953). With the relaxation of the Comics Code Authority\\'s rules in 1971, it became possible for the first time to publish code-approved comic books with werewolves. The Jack Russell version of Werewolf by Night first appeared in Marvel Spotlight #2 (February 1972) and was based on an idea by Roy Thomas. The series name was suggested by Stan Lee and the initial creative team was Gerry Conway and Mike Ploog, who worked from a plot by Roy and Jeanie Thomas for the first issue. Readers have often pointed out that the lead character\\'s name, Jack Russell, is also a breed of dog. 
Conway has said that while he cannot remember how he came up with the name, it is unlikely that he was making this canine reference consciously, since he did not own a dog and never lived with one growing up. After the test run in Marvel Spotlight #2-4, the character graduated to his own eponymous series in September 1972. Conway described working on the series as \"a lot of fun\" because the horror genre made a refreshing change from the superhero stories that had been the staple of mainstream comics for years. Werewolf by Night was published for 43 issues and ran through March 1977. During the series\\' run, the editorship could not resist the opportunity to assign one of their most popular writers, Marv Wolfman, to write some stories for the series with a playful note: \"At last -- WEREWOLF -- written by a WOLFMAN.\"\\n\\nIssue #32 (August 1975) contains the first appearance of the Moon Knight. Jack Russell co-starred with Tigra in Giant-Size Creatures #1 (July 1974), which was the first appearance of Greer Grant Nelson as Tigra instead of as the Cat. That series was retitled Giant-Size Werewolf with its second issue. Jack Russell was dormant for most of the 1980s. The character\\'s appearance was radically revamped in Moon Knight #29 (March 1983). He guest-starred in various issues of Spider-Woman, West Coast Avengers, and Doctor Strange: Sorcerer Supreme. The Werewolf by Night was later revived in the pages of Marvel Comics Presents, where he appeared irregularly from 1991-1993. He made regular appearances as a supporting cast member in the pages of Morbius: The Living Vampire from 1993-1995. A letters page in an issue of Morbius mentioned that a Werewolf by Night miniseries by Len Kaminski and James Fry was in the works, but the miniseries was never published. Werewolf by Night vol. 2 ran for six issues in 1998. The series was written by Paul Jenkins and penciled by Leonardo Manco. After the book\\'s cancellation, the story was continued in the pages of Strange Tales, which also featured the Man-Thing. That volume of Strange Tales was canceled after only two issues due to poor sales. In early 2007, Marvel published a one-shot entitled Legion of Monsters: Werewolf by Night, with art by Greg Land. In January 2009, Jack Russell was featured in the four-issue limited series Dead of Night Featuring Werewolf by Night, from Marvel\\'s mature readers MAX imprint. The series was written by Duane Swierczynski, with art by Mico Suayan. He was featured as a member of Morbius\\' Midnight Sons in Marvel Zombies 4 in 2009.\\n\\nA second Werewolf by Night first appeared in the third volume of Werewolf by Night and was created by Taboo of the Black-Eyed Peas, Benjamin Jackendoff, and Scot Eaton.\\n\\nFictional character biography\\n\\nJack Russell\\n\\nWhile reports of lycanthropy (shapeshifting into a werewolf) in the Russoff line stretch back many centuries, the first confirmed manifestation is Grigori Russoff in 1795. Dracula slew Grigori\\'s wife Louisa after he refused to acknowledge Dracula\\'s primacy upon his return to Transylvania. Grigori then ambushed and destroyed Dracula, but was mutated into a werewolf by Lydia, a werewolf formerly imprisoned by the vampire lord. Grigori took a second wife, but accounts vary as to why lycanthropy failed to pass to his descendants. Sometime prior to May 1930, Grigori\\'s descendant, Gregor, obtained the legendary Darkhold scrolls, binding them back into book form. 
Reading lycanthropy\\'s origins in the Darkhold under a full moon triggered the dormant curse, mutating Gregor into a werewolf. Gregor further transcribed much of the Darkhold into Grigori\\'s diary, essentially creating a Darkhold copy, which he used as his own diary. Gregor sold part of his estate — including Wundagore Mountain — to Jonathon Drew, who shared it with partner Herbert Wyndham (the future High Evolutionary). The Russoff werewolf slew Jonathon\\'s wife, Merriem, and Wyndham designed a suit of silver-coated armor to protect himself, enabling Russoff\\'s capture. Russoff stayed with the Evolutionary, who kept the werewolf safely contained for decades. Russoff eventually used the Darkhold to summon Chthon to cure him and the Elder God nearly broke through the earthly plane; but Magnus the Sorcerer forced Russoff to banish Chthon, who lashed out with a parting blast that slew Gregor. Despite contrary accounts, the Gregor Russoff who stayed with the High Evolutionary seems to have been the grandfather (or great-grandfather) of Jack Russell. Having the same name and presumably using the same diary contributed to earlier confusion. It would seem more likely that the elder Gregor was the one who transcribed the Darkhold into the diary.\\n\\nDecades later, another Gregor Russoff married Laura, the former girlfriend of his younger brother Philip. Jacob (later Jack) was born in Mediaș, Transylvania, soon after, and Laura was pregnant with Lissa within two years of marriage; however, when lightning struck Russoff\\'s Transylvanian castle during a full moon, the werewolf Gregor escaped confinement and began attacking villagers. They tracked down and killed Russoff with silver bullets. Gregor\\'s mother, Maria, was stoned and driven from the village, living with gypsies and learning magic. After Gregor\\'s death, Laura found Philip - who had moved to Los Angeles and anglicized his name to Russell - and they married after a year; Jack and Lissa remained unaware of Philip\\'s past. Approximately 15 years later, the criminal organization known as the Committee learned of Gregor\\'s curse and blackmailed Philip, threatening to reveal his secrets. To protect Laura\\'s name, Philip paid them, but had second thoughts and canceled payment, causing the Committee to send Max Grant to kill Laura. Critically injured in a car crash on Jack\\'s 18th birthday, Laura barely had time to tell Jack about his true father and the curse of the werewolf, making Jack promise not to harm Philip, before dying. Having inherited lycanthropy the night before, Jack slew Grant, but wrongly blamed Philip for some time. Laura left Castle Russoff in Jack\\'s name, but Philip, the trustee, sold the castle to Miles Blackgar, who had it moved to an island off the California coast. Jack battled a motorcycle gang, infecting its members with lycanthropy.\\n\\nJack spent the next few years as a traveler, shapeshifting on the three nights of the full moon into a savage werewolf form. He learned of the Darkhold from Nathan and Agatha Timly, who briefly kidnapped the Werewolf and met grisly ends. Befriending writer Buck Cowan, Jack sneaked into Blackgar\\'s castle and stole the Darkhold, encountering Miles Blackgar and his daughter Marlene, whose petrifying power slew both Blackgars. After fighting off the deformed Cephalos\\' plot to drain his power to stabilize Cephalos\\' form, Jack had Father Ramon Joaquez translate the Darkhold. 
The priest died after being possessed by the Darkhold\\'s former custodian, the 12th-century mad monk Aelfric, and the indestructible Darkhold vanished. Jack encountered Joshua Kane, who hunted the Werewolf, and his brother, Luther Kane, who offered to prevent Lissa from mutating into a werewolf in exchange for Jack kidnapping billionaire-turned recluse Judson Hemp; he met mentalist Swami Rihva, who sought the Werewolf\\'s blood to reveal the treasure map of the ancient sorcerer Kaman-Ru on his \"Bloodstone\"; the possessing demon Krogg; and Spider-Man and Moondark the Magician. Jack then fought the sonic-weapons of Sarnak, his first brush with the criminal organization known as the Committee, who wished to enslave the Werewolf.\\n\\nAfter fighting the sociopathic Hangman (Harlan Krueger), Jack was entranced by Topaz, the familiar of the sorcerer Taboo, who sought the Darkhold. Taboo had used the tome decades before to grant his son, Algon, a golden touch, but had lost the book in mid-spell, trapping Algon in a mindless state. Lacking the Darkhold, Taboo transferred Philip Russell\\'s mind into Algon, but both Algon and Taboo died, restoring Philip, who explained Laura\\'s death and reconciled with both Jack and Lissa. Traveling to Transylvania alongside Topaz, with whom he had bonded, Jack discovered the Russoff diary/Darkhold copy, the Werewolf battled Dracula and the book was lost in the Alps. Jack and Topaz encountered the kyphotic Half-Mad before returning to the U.S. and Jack fought the Committee\\'s Behemoth robot and then Ma Mayhem, assisted by werewolf Raymond Coker. Jack joined the newly-mutated Tigra against HYDRA, battled vampires Louis Belski and Liza Pyne, opposed Ma Mayhem and her ally Baron Thunder, and joined Coker against Lou Hackett (a corrupt policeman who could also shapeshift into a werewolf by using a magic ring), who was killed in the struggle. The Werewolf joined the Frankenstein Monster against the Satanic Brotherhood of Baal who had abducted Lissa, then fought the disfigured Atlas and the Jekyll/Hyde-like DePrayve. Jack briefly returned to Transylvania following Topaz\\'s psychic summons and encountered Maria Russoff, who used Gypsy magic to raise zombies to slay the villagers who had driven her off. Maria sacrificed herself to save Jack from her zombies upon learning that he was her grandson.\\n\\nIn Blackgar\\'s castle, the Werewolf, Topaz and the repentant spirit fragment of Taboo battled the necromancer Doctor Glitternight, who mutated Lissa Russell into a were-demoness; the process of curing Lissa purged her of the threat of lycanthropy, though she would still pass it on to her children. After battling Morbius, the Living Vampire and slaying the demon worshipped by Brad Wrangle, the Werewolf was briefly transported to the divided dimension Biphasia by Satanist Joaquin Zaire and he aided Paingloss against the sorcerer Sardanus. During a subsequent ski trip, the Werewolf nearly slew Buck Cowan, after which he was captured by the Committee-paid mercenary known as the Moon Knight, who set him free when he realized Jack\\'s humanity and the Committee\\'s intentions. 
The Werewolf then joined the Ghost Rider, the Man-Thing, and Morbius in unwittingly slaying the benevolent alien Starseed, who had intended to cure them all.\\n\\nThe Werewolf, Topaz and others then battled and were nearly driven mad by the ghost of 19th-century black magician Belaric Marcosa, but they freed the trapped spirits of Marcosa\\'s victims, who destroyed him, and one of the grateful spirits, that of magician Gideon Blaine, healed Buck. The enigmatic Three Who Are All (the Hooded One, the Burning Snake and the Goat Child) — an ancient extra-dimensional group who had formerly included Glitternight and a fifth member, Fire-Eyes — sent Jack, Topaz, Raymond Coker and Brother Voodoo to Haiti, where the Werewolf and Fire-Eyes destroyed Glitternight once and for all. In the process, Jack gained control of his Werewolf persona, though he still only shapeshifted under moonlight and still lost control during the three nights of the full moon.\\n\\nThe Werewolf joined with Iron Man against the Maggia\\'s Masked Marauder and his Tri-Animan and he teamed with Spider-Woman against the mercenary the Enforcer. The mad scientist Dr. Karl Malus captured and performed scientific experiments on Russell to control him and use him against Spider-Woman; Russell escaped and apprehended Malus with the aid of Spider-Woman. Russell joined Spider-Man against the Tatterdemalion, a former agent of Sarnak. After being temporarily captured alongside a number of costumed adventurers by the Locksmith and Tick-Tock, Russell began mutating into a more savage and lupine form, a late effect from Malus\\' treatment. He fled Satanists Morning Star (Schuyler Belial) and his Left Hand Path, who wished to use his blood to mutate into werewolves, then sought aid from the now-human Michael Morbius in controlling his savage self, leading to a battle with the West Coast Avengers. With assistance from Iron Man, he later saved Lissa from Morgan Le Fay\\'s attempt to possess her.\\n\\nHe was subsequently mind-controlled into joining the mostly-criminal Night Shift by Dansen Macabre. Russell was the only member who knew their leader, the Shroud, was using the group to oppose other criminals and to prevent them from harming innocents. After encounters with Captain America, the Moon Knight and the Avengers, the Werewolf eventually developed resistance to Macabre\\'s powers and turned on the Night Shift, after which he went solo. After briefly battling the Hulk in the Midwest, Jack contacted his father Gregor\\'s spirit to cure his lycanthropy, but was told that he would die unless he accepted his beast. During the ensuing battle with the religious zealot Silver Dagger and the Braineaters, a cult of werewolves mutated in the past by Russell, Jack fully accepted his wolf-self and his personae merged, altering his powers and granting him full control and the best of both selves.\\n\\nRussell assisted Doctor Strange against the alien Possessors, the Night Shift against an L.A. street gang and the Ghost Rider against a new group of Braineaters; battled Sabretooth, who was saved when three locals showed up with rifles and shot at Jack; and fought an unidentified Wendigo in Canada. Russell was captured by criminal scientist Nightshade who used his blood to create the Night Patrol, a group of werewolves in Starkesboro, Massachusetts. 
Captain America - also mutated into a werewolf - freed Russell and led the werewolves to defeat Nightshade\\'s master, Dredmund the Druid, who had used the Godstone (the former gem of the Man-Wolf) to briefly mutate into the powerful Starwolf. The Night Patrol was cured, after which Russell was drawn into a conflict involving the Midnight Sons and was slain by Switchblade (the insane Darkhold-powered Blade), but Jack was revived once Professor Louise Hastings broke Switchblade\\'s spell. Russell befriended the again pseudo-vampiric (and now demon-possessed) Morbius, had a vision of advertisements on the moon causing mass insanity and fought the Lilin Goblins, Mr. Hyde and the sadist Morphine. Jack had an affair with Morbius\\' possessed former girlfriend Martine Bancroft.\\n\\nJack again began losing control of the Werewolf, locking himself in a cage while under the full moons, and even glimpsing visions of Hell as he shapeshifted. From the Cult of the Third Moon\\'s dying leader, Walter Clark, Russell learned that only the legendary Wolfblade could control his lupine self. With the aid of Smedley, a mysterious benefactor, Russell recovered all three parts of the Wolfblade, battled the original Wolf Demon in a branch of Hell, completed the puzzle by reaccepting both selves and seemingly regained control. However, after Jack visited friends Freddie and the disfigured Lump, Smedley sent him to investigate a series of killings in which the evidence pointed to Jack as the killer. As Russell began to mutate further, Smedley said Jack just had not been careful enough in his wish to be freed from the Wolf Demon and that he must embrace the disease, or it would destroy him. Uncertain how to accomplish this, Jack found a confidant in the Lump, who cared for the Werewolf as he hid out in the sewers. While Jack\\'s new girlfriend, Roxanna, remained blissfully unaware of his dual existence, the Werewolf was tracked down by a pair of detectives, escaping only after they were slain by the Cult of the Third Moon. Though Jack\\'s subsequent fate is unknown, he was later seen sensing the arrival of the mystic assassin Hellphyr.\\n\\nIn the Legion of Monsters: Werewolf by Night one-shot, Jack Russell came to Salvage, Alabama to save a family of law-abiding werewolves from a group of townsfolk led by Cal Escher. Young Rhonda was the only one left in the family after her mother and her sister Suzie chose death by gun or knife. The girl was drowning her sorrows in Sullivan\\'s bar next to the cemetery when the gang attacked her, revealing her werewolf nature by means of a tarot card (\"The Moon\") and then trying to kill her. Russell interfered, mutating into the Werewolf while Rhonda decided to do the same. After killing the violent gang, Russell and Rhonda left the town, determined to control their afflictions and live their lives without fear.\\n\\nThe Moon Knight rescues Jack from a criminal enterprise wherein samples of his blood are used to temporarily mutate homeless people into pseudo-werewolves, who are then provoked into fighting each other as a spectator sport. The Moon Knight frees Jack, who has degenerated into a near-mindless feral state, from his captors; the Werewolf proceeds to go on a rampage, attacking both his tormentors and the Moon Knight, who subdues him before restoring his freedom to him.\\n\\nThe Werewolf appears as part of the new Midnight Sons team to hunt down zombies who escaped A.R.M.O.R. headquarters and prevent the contagion from spreading. 
Prior to the team\\'s mission, he records a video will and testament telling his sister that he is happy in life. He was given a vaccine developed by Morbius, the Living Vampire. In their search for the missing zombie Deadpool, the team battles and kills zombie Men-Fish and their leader, the Piranha. After battling the Hood\\'s Night Shift and watching ally the Man-Thing seemingly die in a battle against Deadpool, Russell\\'s vaccine fails him and he becomes a zombie. He then confronts Jennifer Kale. He battles Morbius, who realizes that Jack\\'s werewolf form is not subject to the virus, and Jennifer Kale summons a moonlight spell to mutate him into the Werewolf. Jack is later restored to normal by Morbius, who developed a cure for the zombie virus using Spider-Man\\'s blood and samples of the zombie virus from different realities.\\n\\nAfter the death of Frank Castle, Morbius pieces the Punisher together again and enlists his help in saving monsters from extinction. Jack Russell, the Man-Thing and the Living Mummy are part of the Legion of Monsters, who fight those who would wipe out all monsters. The Punisher aids this group in protecting an underground city that has many innocent, sentient monsters.\\n\\nRussell appears among many mystical beings of lupine and feline nature drawn to the headquarters of X-Factor Investigations by the imminent birth of the mutant Wolfsbane\\'s child. While many of the gathered beings wish to acquire the child for their own ends, Russell seems intent on protecting mother and child. Once the child is born, it is rejected by a shaken Wolfsbane due to its vicious, feral nature and her own religious beliefs. The cub appears to be caught up in a convergence of the mystic forces seeking it, vanishing explosively from the Earth; however, Russell finds the child hiding in a cave and takes it under his care.\\n\\nDeadpool later discovered that Russell had an affair with his wife Shiklah. Deadpool then promptly blew off Jack\\'s head with a blunderbuss, but Shiklah revealed that Jack would survive.\\n\\nJake Gomez\\n\\nJake Gomez is a boy of Hopi descent who underwent his first werewolf transformation at the age of 13 due to a curse in his family. He was able to get control over his werewolf form with the help of his grandmother Rora and his sister Molly, with music helping him control his emotions, unlike how Jake\\'s father operated. Jake first appeared when hunters were hunting rabbits on Hopi tribal lands and fought them off. Rora advised him to be careful in the mission at Life Pharmaceuticals where he worked in the day due to the government going after teenage superheroes. Jake attacks two vehicles leaving Life Pharmaceuticals and finds that one of them contains the people who have gone missing from the Rez in the past month. He is then confronted by three monstrous figures with cybernetic parts on them.\\n\\nPowers and abilities\\nJack Russell is a descendant of the mystically-altered offshoot of humans known as Lycanthropes. During the night of the full moon and the two nights surrounding it, he is forced to mutate into a werewolf, a large, powerful form which is a hybrid of human and wolf, and loses his human intellect. Through a series of events, he is also capable of mutating voluntarily outside of the full moon, at which time he remains in control of himself. As a werewolf, Jack gains the proportionate physical advantages of a wolf. In this form, he possesses superhuman strength, speed, stamina, durability, agility and reflexes. 
He possesses a superhuman sense of smell, which carries over to his human form. He has razor-sharp teeth and claws that are able to rend light metals. The Werewolf is resistant to many forms of conventional injury and very difficult to kill by conventional means. Though he can be severely wounded, he recovers from non-fatal wounds much faster than a human would. He is vulnerable to magical attacks and, like all werewolves, he can be killed by weapons made of silver, due to its inherent mystical \"purity\".\\n\\nJake Gomez has the same transformation abilities as Jack Russell.\\n\\nOther versions\\nIn Marvel\\'s Earth-666, a variation of the Jack Russell version of the Werewolf appeared in Supernatural Tourbook and Supernaturals #1-4.\\n\\nIn the Marvel Adventures continuity, Jack Russell\\'s family home is in Queens, New York. This brings him into conflict with Spider-Man after he reluctantly mutates the somewhat-innocent Flash Thompson into a werewolf. Fortunately, Dr. Strange\\'s knowledge of lycanthropy saves Flash.\\n\\nDuring \"Infinity Wars\", when the universe was folded, Jack Russell got fused with Norman Osborn to create the Goblin by Night. Norman Russell was cursed to be the Goblin by Night, killed Ben and May Spector and nearly killed Peter Spector, leaving Peter to become the ArachKnight. During a battle with Peter, Norman got injured and got saved by his son, Harry Russell. While Harry was taking care of his father, Norman lost control and bit Harry, passing the curse on to him. Harry, now as the new Goblin by Night, starts using the glider that Peter built prior to him becoming the Goblin, leaving Norman free from the curse, being forgiven by Peter and deciding to find a way to cure Harry.\\n\\nIn other media\\n\\nTelevision\\n The Jack Russell incarnation of Werewolf by Night appears in The Super Hero Squad Show episode \"This Man-Thing, This Monster!\", voiced by Rob Paulsen. After his girlfriend, Ellen, is kidnapped by an army of mummies led by N\\'Kantu, the Living Mummy on Dracula\\'s behalf, the Werewolf by Night joins forces with the Man-Thing and a dimensionally-displaced Iron Man to rescue her. While they succeed in defeating the vampire and the mummies, they learn Ellen had been turned into a vampiress. Taking inspiration from Iron Man, Werewolf by Night, Ellen, and the Man-Thing form the Supernatural Hero Squad to defend their town from future monster attacks.\\n The Jack Russell incarnation of Werewolf by Night appears in the Ultimate Spider-Man episodes \"Blade\" and \"The Howling Commandos\", voiced by Ross Lynch. This version is a member of the Howling Commandos. Werewolf by Night and the Howling Commandos join forces with former member Blade and Spider-Man to retrieve a powerful ankh from Dracula.\\n The Jack Russell incarnation of Werewolf by Night appears in Hulk and the Agents of S.M.A.S.H., voiced by Nolan North. This version is a member of the Howling Commandos. In the episode \"Hulking Commandos\", he and the Howling Commandos are assigned to apprehend the agents of S.M.A.S.H., only to join forces with them to defeat Dormammu. In the episode \"Planet Monster: Part 2\", the Werewolf by Night joins the Howling Commandos in helping the agents of S.M.A.S.H. and the Avengers combat the Supreme Intelligence\\'s forces.\\n The Werewolf by Night\\'s unnamed grandfather appears in \"Days of Future Smash, Part 3: Dracula\", also voiced by North. 
While operating in 1890, he helps Frankenstein\\'s Monster, N\\'Kantu the Living Mummy, and a time-traveling Hulk thwart the Leader and Dracula\\'s plan to blanket the Earth in darkness with their Gamma Furnace.\\n A Halloween special based on Werewolf by Night is in development for Disney+. On November 4, 2021, actor Gael García Bernal was cast. On January 11, 2022, Laura Donnelly was cast in an undisclosed role. Michael Giacchino is set to direct Werewolf by Night.\\n\\nFilm\\n A film adaptation of Werewolf by Night, written by Robert Nelson Jacobs, was announced and due to begin filming in 2005. However, no further developments have taken place since.\\n\\nVideo games\\n The Jack Russell incarnation of Werewolf by Night makes a cameo appearance in Jill Valentine\\'s ending in Marvel vs. Capcom 3: Fate of Two Worlds and Ultimate Marvel vs. Capcom 3.\\n The Jack Russell incarnation of Werewolf by Night appeared as an unlockable playable character in Marvel Super Hero Squad Online.\\n The Jack Russell incarnation of Werewolf by Night appeared as an unlockable playable character in Marvel Avengers Academy. He could be first recruited during the event \"Avengers Halloween Event 2017\".\\n\\nReception\\nThe Werewolf by Night was ranked #6 on a listing of Marvel Comics\\' monster characters in 2015.\\n\\nCollected editions\\n Essential Werewolf by Night \\n Vol. 1 collects Marvel Spotlight #2-4, Werewolf By Night #1-21, Marvel Team-Up #12, Giant-Size Creatures #1, and The Tomb of Dracula #18, 576 pages, October 2005, \\n Vol. 2 collects Werewolf By Night #22-43, Giant-Size Werewolf #2-5, and Marvel Premiere #28, 576 pages, November 2007, \\n Essential The Tomb of Dracula Vol. 1 includes Werewolf by Night #15, 560 pages, 2004, \\n Essential Monster of Frankenstein includes Giant-Size Werewolf #2, 496 pages, October 2004, \\n Essential Moon Knight Vol. 1 includes Werewolf by Night #32-33, 560 pages, March 2006, \\n Werewolf by Night: In the Blood includes Werewolf by Night vol. 2 #1-4 \\n Werewolf by Night: The Complete Collection \\n Vol. 1: Marvel Spotlight #2-4, Werewolf by Night #1-15, Marvel Team-Up #12, Tomb of Dracula #18 (October 17, 2017)\\n Vol. 2: Werewolf by Night #16-30, Giant-Size Creatures #1, Giant-Size Werewolf #2-4, material from Monsters Unleashed #6-7 (February 13, 2018)\\n Vol. 3: Werewolf by Night #31-43, Giant-Size Werewolf #5, Marvel Premiere #28, Spider-Woman #6, 19, 32, Marvel Team-Up #93, Ghost Rider #55, Moon Knight #29-30, material from Marvel Premiere #59 (May 15, 2018)\\n\\nReferences\\n\\nExternal links\\n Werewolf by Night at Marvel.com\\n \\n Werewolf by Night appearances in publication order\\n\\n1972 comics debuts\\nCharacters created by Gerry Conway\\nCharacters created by Mike Ploog\\nCharacters created by Roy Thomas\\nComics about werewolves\\nComics by Doug Moench\\nComics by Gerry Conway\\nComics by Marv Wolfman\\nComics characters introduced in 1972\\nDefunct American comics\\nFictional characters with superhuman senses\\nFictional Romanian people\\nFictional werewolves\\nHorror comics\\nMarvel Comics characters who are shapeshifters\\nMarvel Comics characters who can move at superhuman speeds\\nMarvel Comics characters with accelerated healing\\nMarvel Comics characters with superhuman strength\\nMarvel Comics superheroes\\nMarvel Comics titles\\nMidnight Sons'},\n",
+ " {'docid': 'doc-en-14',\n",
+ " 'text': 'The 2021 NHL Entry Draft was the 59th NHL Entry Draft. 
The draft was held on July 23–24, 2021, delayed by one month from its normally scheduled time of June due to the COVID-19 pandemic and the later-than-normal finish of the 2020–21 NHL season. It was thus the first draft held in July since 2005. For the second year in a row, the event was held in a remote format, with teams convening via videoconferencing, and Commissioner Gary Bettman announcing the selections in the opening round and deputy commissioner Bill Daly in all subsequent rounds from the NHL Network studios in Secaucus, New Jersey.\\n\\nThe first three selections were Owen Power going to the Buffalo Sabres, Matty Beniers being selected by the Seattle Kraken, and Mason McTavish being picked by the Anaheim Ducks.\\n\\nEligibility\\nIce hockey players born between January 1, 2001, and September 15, 2003, were eligible for selection in the 2021 NHL Entry Draft. Additionally, un-drafted, non-North American players born in 2000 were eligible for the draft; and those players who were drafted in the 2019 NHL Entry Draft, but not signed by an NHL team and who were born after June 30, 2001, were also eligible to re-enter the draft.\\n\\nDraft lottery\\nFrom the 2012–13 NHL season up to the 2020–21 NHL season all teams not qualifying for the Stanley Cup playoffs have had a \"weighted\" chance at winning the first overall selection. Beginning with the 2014–15 NHL season, the league changed the weighting system that was used in previous years. Under the new system, the odds of winning the draft lottery for the four lowest finishing teams in the league decreased, while the odds for the other non-playoff teams increased. The draft lottery took place on June 2, 2021. After changing the number of lottery drawings earlier in the season, the first two picks overall in this draft were awarded by lottery. The Buffalo Sabres and Seattle Kraken won the two draft lotteries that took place on June 2, 2021, giving them the first and second picks overall. Buffalo retained the first pick, while Seattle moved up one spot and Anaheim dropped one spot to third overall.\\n\\nThe expansion Seattle Kraken had the same odds of winning the lottery as the team that finished with the third fewest points (this ended up being the New Jersey Devils). Because the Arizona Coyotes\\' 2021 first-round pick was forfeited as the result of a penalty sanction due to violations of the NHL Combine Testing Policy during the 2019–20 NHL season, Arizona\\'s lottery odds were instead listed as re-draws.\\n\\n{| class=\"wikitable\"\\n|+ Complete draft position odds\\n! Team\\n! 1st\\n! 2nd\\n! 3rd\\n! 4th\\n! 5th\\n! 6th\\n! 7th\\n! 8th\\n! 9th\\n! 10th\\n! 11th\\n! 12th\\n! 13th\\n! 14th\\n! 15th\\n! 16th\\n|-\\n! Buffalo\\n| style=\"background:#A9D0F5;\"| 16.6% || 15.0% || 68.4% || || || || || || || || || || || || ||\\n|-\\n! Anaheim\\n| 12.1% || 11.7% || style=\"background:#DDDDDD;\"| 26.9% || 49.3% || || || || || || || || || || || ||\\n|-\\n! Seattle\\n| 10.3% || style=\"background:#F5A9BC;\"| 10.2% || 4.7% || 39.3% || 35.6% || || || || || || || || || || ||\\n|-\\n! New Jersey\\n| 10.3% || 10.2% || || style=\"background:#DDDDDD;\"| 11.5% || 43.9% || 24.2% || || || || || || || || || ||\\n|-\\n! Columbus\\n| 8.5% || 8.6% || || || style=\"background:#DDDDDD;\"| 20.6% || 45.8% || 16.5% || || || || || || || || ||\\n|-\\n! Detroit\\n| 7.6% || 7.8% || || || || style=\"background:#DDDDDD;\"| 30.0% || 43.8% || 10.9% || || || || || || || ||\\n|-\\n! 
San Jose\\n| 6.7% || 6.9% || || || || || style=\"background:#DDDDDD;\"| 39.7% || 39.7% || 6.9% || || || || || || ||\\n|-\\n! Los Angeles\\n| 5.8% || 6.0% || || || || || || style=\"background:#DDDDDD;\"| 49.4% || 34.5% || 4.3% || || || || || ||\\n|-\\n! Vancouver\\n| 5.4% || 5.7% || || || || || || || style=\"background:#DDDDDD;\"| 58.6% || 28.0% || 2.4% || || || || ||\\n|-\\n! Ottawa\\n| 4.5% || 4.8% || || || || || || || || style=\"background:#DDDDDD;\"| 67.7% || 21.8% || 1.2% || || || ||\\n|-\\n! Arizona\\n| 3.1% || 3.3% || || || || || || || || || style=\"background:#DDDDDD;\"| 75.9% || 17.1% || 0.7% || || ||\\n|-\\n! Chicago\\n| 2.7% || 2.9% || || || || || || || || || || style=\"background:#DDDDDD;\"| 81.7% || 12.4% || 0.3% || ||\\n|-\\n! Calgary\\n| 2.2% || 2.4% || || || || || || || || || || || style=\"background:#DDDDDD;\"| 87.0% || 8.4% || 0.1% ||\\n|-\\n! Philadelphia\\n| 1.8% || 2.0% || || || || || || || || || || || || style=\"background:#DDDDDD;\"| 91.3% || 4.9% || >0.0%\\n|-\\n! Dallas\\n| 1.4% || 1.5% || || || || || || || || || || || || || style=\"background:#DDDDDD;\"| 95.0% || 2.1%\\n|-\\n! NY Rangers\\n| 1.0% || 1.1% || || || || || || || || || || || || || || style=\"background:#DDDDDD;\"| 97.9%\\n|}\\n\\nTop prospects\\nSource: NHL Central Scouting (May 27, 2021) ranking.\\n\\nSelections by round\\nThe order of the 2021 Entry Draft is listed below.\\n\\nRound one\\n\\nNotes\\n The Vancouver Canucks\\' first-round pick went to the Arizona Coyotes as the result of a trade on July 23, 2021, that sent Oliver Ekman-Larsson and Conor Garland to Vancouver in exchange for Jay Beagle, Loui Eriksson, Antoine Roussel, a second-round pick in 2022, a seventh-round pick in 2023 and this pick.\\n The Arizona Coyotes\\' first-round pick was forfeited as the result of a penalty sanction due to violations of the NHL Combine Testing Policy during the 2019–20 NHL season. 
The penalty includes the forfeiture of a second-round pick in 2020 and this pick.\\n The Chicago Blackhawks\\' first-round pick went to the Columbus Blue Jackets as the result of a trade on July 23, 2021, that sent Seth Jones, Tampa Bay\\'s first-round pick in 2021 (32nd overall) and a sixth-round pick in 2022 to Chicago in exchange for Adam Boqvist, a second-round pick in 2021 (44th overall), a conditional first-round pick in 2022 and this pick.\\n The Philadelphia Flyers\\' first-round pick went to the Buffalo Sabres as the result of a trade on July 23, 2021, that sent Rasmus Ristolainen to Philadelphia in exchange for Robert Hagg, a second-round pick in 2023 and this pick.\\n The Dallas Stars\\' first-round pick went to the Detroit Red Wings as the result of a trade on July 23, 2021, that sent Washington\\'s first-round pick, the Rangers\\' second-round pick and Ottawa\\'s fifth-round pick all in 2021 (23rd, 48th and 138th overall) to Dallas in exchange for this pick.\\n The Edmonton Oilers\\' first-round pick went to the Minnesota Wild as the result of a trade on July 23, 2021, that sent a first-round pick and Pittsburgh\\'s third-round pick both in 2021 (22nd and 90th overall) to Edmonton in exchange for this pick.\\n The Minnesota Wild\\'s first-round pick went to the Edmonton Oilers as the result of a trade on July 23, 2021, that sent a first-round pick in 2021 (20th overall) to Minnesota in exchange for Pittsburgh\\'s third-round pick in 2021 (90th overall) and this pick.\\n The Washington Capitals\\' first-round pick went to the Dallas Stars as the result of a trade on July 23, 2021, that sent a first-round pick in 2021 (15th overall) to Detroit in exchange for the Rangers\\' second-round pick and Ottawa\\'s fifth-round pick both in 2021 (48th and 138th overall) and this pick.\\nDetroit previously acquired this pick as the result of a trade on April 12, 2021, that sent Anthony Mantha to Washington in exchange for Richard Panik, Jakub Vrana, a second-round pick in 2022 and this pick.\\n The Toronto Maple Leafs\\' first-round pick went to the Columbus Blue Jackets as the result of a trade on April 11, 2021, that sent Nick Foligno and Stefan Noesen to Toronto in exchange for a fourth-round pick in 2022 and this pick.\\n The Pittsburgh Penguins\\' first-round pick went to the Minnesota Wild as the result of a trade on February 10, 2020, that sent Jason Zucker to Pittsburgh in exchange for Alex Galchenyuk, Calen Addison and this pick (being conditional at the time of the trade). The condition – Minnesota will receive a 2021 first-round pick at Pittsburgh\\'s choice if the Penguins fail to qualify for the 2020 Eastern Conference First Round – was converted on August 12, 2020, when the Penguins elected to defer the pick to 2021.\\n The Carolina Hurricanes\\' first-round pick went to the Nashville Predators as the result of a trade on July 23, 2021, that sent Los Angeles\\' and Nashville\\'s second-round picks both in 2021 to Carolina in exchange for this pick.\\n The New York Islanders\\' first-round pick went to the New Jersey Devils as the result of a trade on April 7, 2021, that sent Kyle Palmieri and Travis Zajac to New York in exchange for A. J. 
Greer, Mason Jobst, a conditional fourth-round pick in 2022 and this pick.\\n The Tampa Bay Lightning\\'s first-round pick went to the Chicago Blackhawks as the result of a trade on July 23, 2021, that sent a first and second-round pick both in 2021 (12th and 44th overall) to Columbus in exchange for Seth Jones, a sixth-round pick in 2022 and this pick.\\nColumbus previously acquired this pick as the result of a trade on April 10, 2021, that sent Brian Lashoff to Tampa Bay in exchange for a third-round pick in 2022 and this pick.\\n\\nRound two\\n\\nNotes\\n The New Jersey Devils\\' second-round pick went to the Detroit Red Wings as the result of a trade on July 24, 2021, that sent a second-round pick and Tampa Bay\\'s fourth-round pick both in 2021 (38th and 128th overall) to Vegas in exchange for this pick.\\nVegas previously acquired this pick as the result of a trade on July 29, 2019, that sent Nikita Gusev to New Jersey in exchange for a third-round pick in 2020 and this pick.\\n The Columbus Blue Jackets\\' second-round pick went to the Arizona Coyotes as the result of a trade on December 26, 2020, that sent Derek Stepan to Ottawa in exchange for this pick.\\nOttawa previously acquired this pick as the result of a trade on February 23, 2019, that sent Ryan Dzingel and Calgary\\'s seventh-round pick in 2019 to Columbus in exchange for Anthony Duclair, a second-round pick in 2020, and this pick.\\n The Detroit Red Wings\\' second-round pick went to the Vegas Golden Knights as the result of a trade on July 24, 2021, that sent New Jersey\\'s second-round pick in 2021 (36th overall) to Detroit in exchange for Tampa Bay\\'s fourth-round pick in 2021 (128th overall) and this pick.\\n The San Jose Sharks\\' second-round pick went to the Ottawa Senators as the result of a trade on September 13, 2018, that sent Erik Karlsson and Francis Perron to San Jose in exchange for Chris Tierney, Dylan DeMelo, Josh Norris, Rudolfs Balcers, a conditional second-round pick in 2019, a conditional first-round pick in 2019 or 2020, a conditional first-round pick no later than 2022, and this pick (being conditional at the time of the trade). The condition – Ottawa will receive a second-round pick in 2021 if Karlsson re-signs with the Sharks for the 2019–20 NHL season and the Sharks do not make the 2019 Stanley Cup Finals – was converted on June 17, 2019, when Karlsson re-signed with San Jose for the 2019–20 NHL season.\\n The Los Angeles Kings\\' second-round pick went to the Carolina Hurricanes as the result of a trade on July 23, 2021, that sent Carolina\\'s first-round pick (27th overall) in 2021 to Nashville in exchange for a second-round pick (51st overall) and this pick.\\nNashville previously acquired this pick as the result of a trade on July 1, 2021, that sent Viktor Arvidsson to Los Angeles in exchange for a third-round pick in 2022 and this pick.\\n The Ottawa Senators\\' second-round pick went to the Los Angeles Kings as the result of a trade on July 24, 2021, that sent St. 
Louis\' second-round pick and a fifth-round pick both in 2021 (49th and 136th overall) to Ottawa in exchange for this pick.\\n The Chicago Blackhawks\\' second-round pick went to the Carolina Hurricanes as the result of a trade on July 23, 2021, that sent Jake Bean to Columbus in exchange for this pick.\\nColumbus previously acquired this pick as the result of a trade on July 23, 2021, that sent Seth Jones, Tampa Bay\\'s first-round pick in 2021 (32nd overall) and a sixth-round pick in 2022 to Chicago in exchange for Adam Boqvist, a first-round pick in 2021 (12th overall), a conditional first-round pick in 2022 and this pick.\\n The New York Rangers\\' second-round pick went to the Dallas Stars as the result of a trade on July 23, 2021, that sent a first-round pick in 2021 (15th overall) to Detroit in exchange for Washington\\'s first-round pick and Ottawa\\'s fifth-round pick both in 2021 (23rd and 138th overall) and this pick.\\nDetroit previously acquired this pick as the result of a trade on September 26, 2020, that sent future considerations to New York in exchange for Marc Staal and this pick.\\n The St. Louis Blues\\' second-round pick went to the Ottawa Senators as the result of a trade on July 24, 2021, that sent a second-round pick in 2021 (42nd overall) to Los Angeles in exchange for a fifth-round pick in 2021 (136th overall) and this pick.\\nLos Angeles previously acquired this pick as the result of a trade on February 19, 2020, that sent Alec Martinez to Vegas in exchange for a second-round pick in 2020 and this pick.\\nVegas previously acquired this pick as the result of a trade on June 28, 2019, that sent Colin Miller to Buffalo in exchange for a fifth-round pick in 2022 and this pick.\\nBuffalo previously acquired this pick as the result of a trade on July 1, 2018, that sent Ryan O\\'Reilly to St. 
Louis in exchange for Vladimir Sobotka, Patrik Berglund, Tage Thompson, a conditional first-round pick in 2019 or 2020 and this pick.\\n The Nashville Predators\\' second-round pick went to the Carolina Hurricanes as the result of a trade on July 23, 2021, that sent Carolina\\'s first-round pick (27th overall) in 2021 to Nashville in exchange for Los Angeles\\' second-round pick (40th overall) and this pick.\\n The Edmonton Oilers\\' second-round pick went to the New York Islanders as the result of a trade on July 16, 2021, that sent Nick Leddy to Detroit in exchange for Richard Panik and this pick.\\nDetroit previously acquired this pick as the result of a trade on February 24, 2020, that sent Andreas Athanasiou and Ryan Kuffner to Edmonton in exchange for Sam Gagner, a second-round pick in 2020 and this pick.\\n The Boston Bruins\\' second-round pick went to the Buffalo Sabres as the result of a trade on April 12, 2021, that sent Taylor Hall and Curtis Lazar to Boston in exchange for Anders Bjork and this pick.\\n The Carolina Hurricanes\\' second-round pick went to the Los Angeles Kings as the result of a trade on July 24, 2021, that sent a third-round pick and Calgary\\'s fourth-round pick both in 2021 (72nd and 109th overall) to Carolina in exchange for this pick.\\n The Colorado Avalanche\\'s second-round pick went to the Arizona Coyotes as the result of a trade on July 17, 2021, that sent future considerations to the New York Islanders in exchange for Andrew Ladd, a conditional second-round pick in 2022, a conditional third-round pick in 2023, and this pick.\\nThe Islanders previously acquired this pick as the result of a trade on October 12, 2020, that sent Devon Toews to Colorado in exchange for a second-round pick in 2022 and this pick.\\n The New York Islanders\\' second-round pick went to the Colorado Avalanche as the result of a trade on July 15, 2021, that sent Ryan Graves to New Jersey in exchange for Mikhail Maltsev and this pick.\\nNew Jersey previously acquired this pick as the result of a trade on February 16, 2020, that sent Andy Greene to New York in exchange for David Quenneville and this pick.\\n The Vegas Golden Knights\\' second-round pick went to the Chicago Blackhawks as the result of a trade on April 12, 2021, that sent Nick DeSimone and a fifth-round pick in 2022 to Vegas in exchange for a third-round pick in 2022 and this pick.\\n The Tampa Bay Lightning\\'s second-round pick went to the Montreal Canadiens as the result of a trade on October 7, 2020, that sent St. Louis\\' second-round pick in 2020 (57th overall) to Tampa Bay in exchange for a fourth-round pick in 2020 (124th overall) and this pick.\\n\\nRound three\\n\\nNotes\\n The Buffalo Sabres\\' third-round pick went to the New York Rangers as the result of a trade on July 1, 2019, that sent Jimmy Vesey to Buffalo in exchange for this pick.\\n The San Jose Sharks\\' third-round pick went to the St. 
Louis Blues as the result of a trade on July 24, 2021, that sent a third and sixth-round pick both in 2021 (81st and 177th overall) to San Jose in exchange for this pick.\\n The Los Angeles Kings\\' third-round pick went to the Nashville Predators as the result of a trade on July 24, 2021, that sent a third and fifth-round pick both in 2021 (83rd and 147th overall) to Carolina in exchange for this pick.\\nCarolina previously acquired this pick as the result of a trade on July 24, 2021, that sent a second-round pick in 2021 (59th overall) to Los Angeles in exchange for Calgary\\'s fourth-round pick in 2021 (109th overall) and this pick.\\n The Vancouver Canucks\\' third-round pick went to the Dallas Stars as the result of a trade on July 17, 2021, that sent Jason Dickinson to Vancouver in exchange for this pick.\\n The Arizona Coyotes\\' third-round pick went to the New York Rangers as the result of a trade on July 24, 2021, that sent a third and sixth-round pick both in 2021 (80th and 176th overall) to Washington in exchange for this pick.\\nWashington previously acquired this pick as the result of a trade on April 11, 2021, that sent Jonas Siegenthaler to New Jersey in exchange for this conditional pick. The condition – Washington will receive Arizona\\'s third-round pick in 2021 at New Jersey\\'s choice, if the pick is available before the time of the selection – the date of conversion is unknown.\\nNew Jersey previously acquired this pick as the result of a trade on December 16, 2019, that sent Taylor Hall and Blake Speers to Arizona in exchange for Nick Merkley, Kevin Bahl, Nate Schnarr, a conditional first-round pick in 2020 and this pick (being conditional at the time of the trade). The condition – New Jersey will receive a third-round pick in 2021 if Arizona does not advance to the 2020 Western Conference Second Round and Hall does not re-sign with Arizona for the 2020–21 NHL season – was converted when Arizona was eliminated in the First Round of the playoffs on August 19, 2020 and when Hall signed with the Buffalo Sabres on October 11, 2020.\\n The Chicago Blackhawks\\' third-round pick went to the Anaheim Ducks as the result of a trade on July 24, 2021, that sent a third-round pick in 2022 to Montreal in exchange for this pick.\\nMontreal previously acquired this pick as the result of a trade on June 30, 2019, that sent Andrew Shaw and a seventh-round pick in 2021 to Chicago in exchange for second and seventh-round picks both in 2020 and this pick.\\n The New York Rangers\\' third-round pick went to the Washington Capitals as the result of a trade on July 24, 2021, that sent Arizona\\'s third-round pick in 2021 (75th overall) to New York in exchange for a sixth-round pick in 2021 (176th overall) and this pick.\\n The St. Louis Blues\\' third-round pick went to the San Jose Sharks as the result of a trade on July 24, 2021, that sent a third-round pick in 2021 (71st overall) to St.
Louis in exchange for a sixth-round pick in 2021 (177th overall) and this pick.\\n The Nashville Predators\\' third-round pick went to the Carolina Hurricanes as the result of a trade on July 24, 2021, that sent Los Angeles\\' third-round pick in 2021 (72nd overall) to Nashville in exchange for a fifth-round pick in 2021 (147th overall) and this pick.\\n The Edmonton Oilers\\' third-round pick went to the Los Angeles Kings as the result of a trade on July 24, 2021, that sent Toronto\\'s third-round pick and a sixth-round pick both in 2021 (89th and 168th overall) to Calgary in exchange for this pick.\\nCalgary previously acquired this pick as the result of a trade on July 19, 2019, that sent James Neal to Edmonton in exchange for Milan Lucic and this pick (being conditional at the time of the trade). The condition – Calgary will receive a third-round pick in 2020 or 2021 at Edmonton\\'s choice, after the league made a ruling on this conditional pick on July 31, 2020. The original condition on this pick was that Calgary will receive a 2020 third-round pick if Neal scores at least 21 goals during the 2019–20 NHL season and Lucic has at least ten fewer goals than Neal – was converted when the Oilers elected to keep their 2020 third-round pick on October 7, 2020.\\n The Washington Capitals\\' third-round pick went to the Montreal Canadiens as the result of a trade on October 7, 2020, that sent Anaheim\\'s fourth-round pick in 2020 (98th overall) to San Jose in exchange for this pick.\\nSan Jose previously acquired this pick as the result of a trade on February 18, 2020, that sent Brenden Dillon to Washington in exchange for Colorado\\'s second-round pick in 2020 and this pick (being conditional at the time of the trade). The condition – San Jose will receive a third-round pick in 2021 if Washington does not win the Stanley Cup in 2020 – was converted when Washington was eliminated from the 2020 Stanley Cup playoffs on August 20, 2020.\\n The Florida Panthers\\' third-round pick went to the Buffalo Sabres as the result of a trade on April 10, 2021, that sent Brandon Montour to Florida in exchange for this pick.\\n The Toronto Maple Leafs\\' third-round pick went to the Calgary Flames as the result of a trade on July 24, 2021, that sent Edmonton\\'s third-round pick in 2021 (84th overall) to Los Angeles in exchange for a sixth-round pick in 2021 (168th overall) and this pick.\\nLos Angeles previously acquired this pick as the result of a trade on February 5, 2020, that sent Jack Campbell and Kyle Clifford to Toronto in exchange for Trevor Moore, Columbus\\' third-round pick in 2020 and this pick (being conditional at the time of the trade). The condition – Los Angeles will receive a third-round pick in 2021 if Clifford does not re-sign with Toronto for the 2020–21 NHL season – was converted when Clifford signed with St. Louis.\\n The Pittsburgh Penguins\\' third-round pick went to the Edmonton Oilers as the result of a trade on July 23, 2021, that sent a first-round pick in 2021 (20th overall) to Minnesota in exchange for a first-round pick in 2021 (22nd overall) and this pick.\\nMinnesota previously acquired this pick as the result of a trade on October 5, 2020, that sent Ryan Donato to San Jose in exchange for this pick.\\nSan Jose previously acquired this pick as the result of a trade on February 24, 2020, that sent Patrick Marleau to Pittsburgh in exchange for this pick (being conditional at the time of the trade). 
The condition – San Jose will receive a third-round pick in 2021 if Pittsburgh does not win the Stanley Cup in 2020 – was converted when the Penguins were eliminated from the 2020 Stanley Cup playoffs on August 7, 2020.\\n The Carolina Hurricanes\\' third-round pick went to the Chicago Blackhawks as the result of a trade on July 24, 2021, that sent a third-round pick in 2022 to Carolina in exchange for this pick.\\n The Vegas Golden Knights\\' third-round pick went to the Carolina Hurricanes as the result of a trade on July 22, 2021, that sent Alex Nedeljkovic to Detroit in exchange for Jonathan Bernier and this pick.\\nDetroit previously acquired this pick as the result of a trade on February 26, 2018, that sent Tomas Tatar to Vegas in exchange for a first-round pick in 2018, the Islanders\\' second-round pick in 2019 and this pick.\\n The Montreal Canadiens\\' third-round pick went to the Buffalo Sabres as the result of a trade on March 26, 2021, that sent Eric Staal to Montreal in exchange for a fifth-round pick in 2021 and this pick.\\n\\nRound four\\n\\nNotes\\n The Detroit Red Wings\\' fourth-round pick went to the Vegas Golden Knights as the result of a trade on July 24, 2021, that sent Winnipeg\\'s fourth-round pick and Carolina\\'s fifth-round pick both in 2021 (114th and 155th overall) to Detroit in exchange for this pick.\\n The Los Angeles Kings\\' fourth-round pick went to the New York Rangers as the result of a trade on March 27, 2021, that sent Brendan Lemieux to Los Angeles in exchange for this pick.\\n The Vancouver Canucks\\' fourth-round pick went to the Chicago Blackhawks as the result of a trade on April 12, 2021, that sent Madison Bowey and a fifth-round pick in 2021 to Vancouver in exchange for this pick.\\n The Ottawa Senators\\' fourth-round pick went to the New York Rangers as the result of a trade on October 7, 2019, that sent Vladislav Namestnikov to Ottawa in exchange for Nick Ebert and this pick.\\n The Calgary Flames\\' fourth-round pick went to the Carolina Hurricanes as the result of a trade on July 24, 2021, that sent a second-round pick in 2021 (59th overall) to Los Angeles in exchange for a third-round pick in 2021 (72nd overall) and this pick.\\nLos Angeles previously acquired this pick as the result of a trade on February 24, 2020, that sent Derek Forbort to Calgary in exchange for this pick (being conditional at the time of the trade). The condition – Los Angeles will receive a fourth-round pick in 2021 if Forbort does not re-sign with Calgary for the 2020–21 NHL season – was converted when Forbort signed with the Winnipeg Jets on October 11, 2020.\\n The St. Louis Blues\\' fourth-round pick went to the Montreal Canadiens as the result of a trade on February 18, 2020, that sent Marco Scandella to St. Louis in exchange for a second-round pick in 2020 and this pick (being conditional at the time of the trade).
The condition – Montreal will receive a fourth-round pick in 2021 if Scandella re-signs with the Blues for the 2020–21 NHL season by October 7, 2020 – was converted when Scandella re-signed with the Blues on April 16, 2020.\\n The Winnipeg Jets\\' fourth-round pick went to the Detroit Red Wings as the result of a trade on July 24, 2021, that sent a fourth-round pick in 2021 (102nd overall) to Vegas in exchange for Carolina\\'s fifth-round pick in 2021 (155th overall) and this pick.\\nVegas previously acquired this pick as the result of a trade on February 21, 2020, that sent Cody Eakin to Winnipeg in exchange for this pick (being conditional at the time of the trade). The condition – Vegas will receive a fourth-round pick in 2021 if Eakin does not re-sign with the Jets for the 2020–21 NHL season – was converted when Eakin signed with the Buffalo Sabres for the 2020–21 NHL season on October 10, 2020.\\n The Toronto Maple Leafs\\' fourth-round pick went to the San Jose Sharks as the result of a trade on April 11, 2021, that sent Nick Foligno to Toronto in exchange for this pick.\\n The Pittsburgh Penguins\\' fourth-round pick went to the Arizona Coyotes as the result of a trade on June 29, 2019, that sent Alex Galchenyuk and Pierre-Olivier Joseph to Pittsburgh in exchange for Phil Kessel, Dane Birks and this pick.\\n The Carolina Hurricanes\\' fourth-round pick went to the Ottawa Senators as the result of a trade on July 24, 2021, that sent Los Angeles\\' fifth-round pick and a sixth-round pick both in 2021 (136th and 170th overall) to Carolina in exchange for this pick.\\n The Colorado Avalanche\\'s fourth-round pick went to the Nashville Predators as the result of a trade on October 10, 2020, that sent Austin Watson to Ottawa in exchange for this pick.\\nOttawa previously acquired this pick as the result of a trade on February 24, 2020, that sent Vladislav Namestnikov to Colorado in exchange for this pick.\\n The Vegas Golden Knights\\' fourth-round pick went to the Tampa Bay Lightning as the result of a trade on July 24, 2021, that sent a fourth-round pick in 2022 to Montreal in exchange for this pick.\\nMontreal previously acquired this pick as the result of a trade on February 24, 2020, that sent Nick Cousins to Vegas in exchange for this pick.\\n The Montreal Canadiens\\' fourth-round pick went to the Minnesota Wild as the result of a trade on July 24, 2021, that sent a fifth and seventh-round pick both in 2021 (150th and 214th overall) to Montreal in exchange for this pick.\\n The Tampa Bay Lightning\\'s fourth-round pick went to the Vegas Golden Knights as the result of a trade on July 24, 2021, that sent New Jersey\\'s second-round pick in 2021 (36th overall) to Detroit in exchange for a second-round pick in 2021 (38th overall) and this pick.\\nDetroit previously acquired this pick as the result of a trade on April 10, 2021, that sent David Savard to Tampa Bay in exchange for this pick.\\n\\nRound five\\n\\nNotes\\n The Buffalo Sabres\\' fifth-round pick went to the New Jersey Devils as the result of a trade on February 24, 2020, that sent Wayne Simmonds to Buffalo in exchange for this pick (being conditional at the time of the trade). 
The condition – New Jersey will receive a fifth-round pick in 2021 if the Sabres do not qualify for the 2020 Stanley Cup playoffs – was converted on May 26, 2020, when it was announced the Sabres would not participate in the 2020 Stanley Cup playoffs.\\n The New Jersey Devils\\' fifth-round pick went to the Columbus Blue Jackets as the result of a trade on October 8, 2020, that sent Ryan Murray to New Jersey in exchange for this pick.\\n The Los Angeles Kings\\' fifth-round pick went to the Carolina Hurricanes as the result of a trade on July 24, 2021, that sent a fourth-round pick in 2021 (123rd overall) to Ottawa in exchange for a sixth-round pick in 2021 (170th overall) and this pick.\\nOttawa previously acquired this pick as the result of a trade on July 24, 2021, that sent a second-round pick in 2021 (42nd overall) to Los Angeles in exchange for St. Louis\\' second-round pick in 2021 (49th overall) and this pick.\\n The Ottawa Senators\\' fifth-round pick went to the Dallas Stars as the result of a trade on July 23, 2021, that sent a first-round pick in 2021 (15th overall) to Detroit in exchange for Washington\\'s first-round pick and the Rangers\\' second-round pick both in 2021 (23rd and 48th overall) and this pick.\\nDetroit previously acquired this pick as the result of a trade on April 11, 2021, that sent Jon Merrill to Montreal in exchange for Hayden Verbeek and this pick.\\nMontreal previously acquired this pick as the result of a trade on January 2, 2020, that sent Mike Reilly to Ottawa in exchange for Andrew Sturtz and this pick.\\n The Chicago Blackhawks\\' fifth-round pick went to the Vancouver Canucks as the result of a trade on April 12, 2021, that sent a fourth-round pick in 2021 to Chicago in exchange for Madison Bowey and this pick.\\n The Philadelphia Flyers\\' fifth-round pick went to the Montreal Canadiens as the result of a trade on February 24, 2020, that sent Nate Thompson to Philadelphia in exchange for this pick.\\n The Nashville Predators\\' fifth-round pick went to the Carolina Hurricanes as the result of a trade on July 24, 2021, that sent Los Angeles\\' third-round pick in 2021 (72nd overall) to Nashville in exchange for a third-round pick in 2021 (83rd overall) and this pick.\\n The Edmonton Oilers\\' fifth-round pick went to the Anaheim Ducks as the result of a trade on October 8, 2020, that sent Erik Gudbranson to Ottawa in exchange for this pick.\\nOttawa previously acquired this pick as the result of a trade on February 24, 2020, that sent Tyler Ennis to Edmonton in exchange for this pick.\\n The Minnesota Wild\\'s fifth-round pick went to the Montreal Canadiens as the result of a trade on July 24, 2021, that sent a fourth-round pick in 2021 (127th overall) to Minnesota in exchange for a seventh-round pick in 2021 (214th overall) and this pick.\\n The Carolina Hurricanes\\' fifth-round pick went to the Detroit Red Wings as the result of a trade on July 24, 2021, that sent a fourth-round pick in 2021 (102nd overall) to Vegas in exchange for Winnipeg\\'s fourth-round pick in 2021 (114th overall) and this pick.\\nVegas previously acquired this pick as the result of a trade on June 26, 2019, that sent Erik Haula to Carolina in exchange for Nicolas Roy and this pick (being conditional at the time of the trade).
The condition – Vegas will receive a fifth-round pick in 2021 if Carolina trades Haula for a player, multiple draft picks or if he is traded for a draft pick in the first five rounds of any future draft – was converted when Haula was traded to the Florida Panthers on February 24, 2020.\\n The Colorado Avalanche\\'s fifth-round pick went to the San Jose Sharks as the result of a trade on April 10, 2021, that sent Devan Dubnyk to Colorado in exchange for Greg Pateryn and this pick.\\n The Vegas Golden Knights\\' fifth-round pick went to the Philadelphia Flyers as the result of a trade on April 12, 2021, that sent Michael Raffl to Washington in exchange for this pick.\\nWashington previously acquired this pick as the result of a trade on December 2, 2019, that sent Chandler Stephenson to Vegas in exchange for this pick.\\n The Montreal Canadiens\\' fifth-round pick went to the Buffalo Sabres as the result of a trade on March 26, 2021, that sent Eric Staal to Montreal in exchange for a third-round pick in 2021 and this pick.\\n\\nRound six\\n\\nNotes\\n The Los Angeles Kings\\' sixth-round pick went to the Calgary Flames as the result of a trade on July 24, 2021, that sent Edmonton\\'s third-round pick in 2021 (84th overall) to Los Angeles in exchange for Toronto\\'s third-round pick in 2021 (89th overall) and this pick.\\n The Ottawa Senators\\' sixth-round pick went to the Carolina Hurricanes as the result of a trade on July 24, 2021, that sent a fourth-round pick in 2021 (123rd overall) to Ottawa in exchange for Los Angeles\\' fifth-round pick in 2021 (136th overall) and this pick.\\n The New York Rangers\\' sixth-round pick went to the Washington Capitals as the result of a trade on July 24, 2021, that sent Arizona\\'s third-round pick in 2021 (75th overall) to New York in exchange for a third-round pick in 2021 (80th overall) and this pick.\\n The St. Louis Blues\\' sixth-round pick went to the San Jose Sharks as the result of a trade on July 24, 2021, that sent a third-round pick in 2021 (71st overall) to St. Louis in exchange for a third-round pick in 2021 (81st overall) and this pick.\\n The Winnipeg Jets\\' sixth-round pick went to the Vancouver Canucks as the result of a trade on April 12, 2021, that sent Jordie Benn to Winnipeg in exchange for this pick.\\n The Pittsburgh Penguins\\' sixth-round pick went to the Edmonton Oilers as the result of a trade on July 26, 2019, that sent John Marino to Pittsburgh in exchange for this pick (being conditional at the time of the trade). The condition – Edmonton will receive a sixth-round pick in 2021 if Marino signs with the Penguins – was converted when Marino signed with the Penguins on August 8, 2019.\\n The Colorado Avalanche\\'s sixth-round pick went to the Buffalo Sabres as the result of a trade on March 20, 2021, that sent Jonas Johansson to Colorado in exchange for this pick.\\n\\nRound seven\\n\\nNotes\\n The Anaheim Ducks\\' seventh-round pick went to the Pittsburgh Penguins as the result of a trade on October 25, 2019, that sent Erik Gudbranson to Anaheim in exchange for Andreas Martinsen and this pick.\\n The New Jersey Devils\\' seventh-round pick went to the Tampa Bay Lightning as the result of a trade on November 1, 2019, that sent Louis Domingue to New Jersey in exchange for this pick (being conditional at the time of the trade).
The condition – Tampa Bay will receive a seventh-round pick in 2021 if Domingue plays in seven games for the Devils during the 2019–20 NHL season – was converted on January 9, 2020.\\n The Detroit Red Wings\\' seventh-round pick went to the St. Louis Blues as the result of a trade on October 7, 2020, that sent Chicago\\'s seventh-round pick in 2020 (203rd overall) to Detroit in exchange for this pick.\\n The Los Angeles Kings\\' seventh-round pick went to the Carolina Hurricanes as the result of a trade on October 7, 2020, that sent Montreal\\'s fifth-round pick in 2020 (140th overall) to Los Angeles in exchange for a sixth-round pick in 2020 and this pick.\\n The Arizona Coyotes\\' seventh-round pick went to the New Jersey Devils as the result of a trade on October 7, 2020, that sent a seventh-round pick in 2020 (192nd overall) to Arizona in exchange for this pick.\\n The St. Louis Blues\\' seventh-round pick went to the Carolina Hurricanes as the result of a trade on September 24, 2019, that sent Justin Faulk and a fifth-round pick in 2020 to St. Louis in exchange for Joel Edmundson, Dominik Bokk and this pick.\\n The Winnipeg Jets\\' seventh-round pick went to the Florida Panthers as the result of a trade on February 25, 2019, that sent Bogdan Kiselevich to Winnipeg in exchange for this pick.\\n The Nashville Predators\\' seventh-round pick went to the Tampa Bay Lightning as the result of a trade on June 14, 2019, that sent Connor Ingram to Nashville in exchange for this pick.\\n The Minnesota Wild\\'s seventh-round pick went to the Montreal Canadiens as the result of a trade on July 24, 2021, that sent a fourth-round pick in 2021 (127th overall) to Minnesota in exchange for a fifth-round pick in 2021 (150th overall) and this pick.\\n The Washington Capitals\\' seventh-round pick went to the Pittsburgh Penguins as the result of a trade on October 7, 2020, that sent Colorado\\'s seventh-round pick in 2020 (211th overall) to Washington in exchange for this pick.\\n The Florida Panthers\\' seventh-round pick went to the Chicago Blackhawks as the result of a trade on April 8, 2021, that sent Lucas Carlsson and Lucas Wallmark to Florida in exchange for Brett Connolly, Riley Stillman, Henrik Borgstrom and this pick.\\n The Toronto Maple Leafs\\' seventh-round pick went to the Boston Bruins as the result of a trade on October 7, 2020, that sent a seventh-round pick in 2020 (213th overall) to Toronto in exchange for this pick.\\n The Montreal Canadiens\\' seventh-round pick went to the Arizona Coyotes as the result of a trade on July 24, 2021, that sent St.
Louis\\' seventh-round pick in 2022 to Montreal in exchange for this pick.\\nMontreal previously re-acquired this pick as the result of a trade on October 7, 2020, that sent Ottawa\\'s seventh-round pick in 2020 to Chicago in exchange for this pick.\\nChicago previously acquired this pick as the result of a trade on June 30, 2019, that sent second and seventh-round picks both in 2020 and a third-round pick in 2021 to Montreal in exchange for Andrew Shaw and this pick.\\n\\nDraftees based on nationality\\n\\nNorth American draftees by state/province\\n\\nSee also\\n 2017–18 NHL transactions\\n 2018–19 NHL transactions\\n 2019–20 NHL transactions\\n 2020–21 NHL transactions\\n 2021–22 NHL transactions\\n 2020–21 NHL season\\n 2021 NHL Expansion Draft\\n List of first overall NHL draft picks\\n List of NHL players\\n\\nReferences\\n\\nExternal links\\n2021 NHL Entry Draft player stats at The Internet Hockey Database\\n\\nEntry Draft\\nNHL Entry Draft\\nNHL Entry Draft\\nEvents in New Jersey\\nIce hockey in New Jersey\\nNHL\\nNational Hockey League in the New York metropolitan area\\nNational Hockey League Entry Draft'},\n", + " {'docid': 'doc-en-15',\n", + " 'text': 'The history of Christianity in Sussex includes all aspects of the Christianity in the region that is now Sussex from its introduction to the present day. Christianity is the most commonly practised religion in Sussex.\\n\\nEarly history\\n\\nAfter the Roman conquest of AD 43, the Celtic society of Sussex became heavily Romanized.\\n\\nThe first written account of Christianity in Britain comes from the early Christian Berber author, Tertullian, writing in the third century, who said that \"Christianity could even be found in Britain.\" Emperor Constantine (AD\\xa0306-337), granted official tolerance to Christianity with the Edict of Milan in AD\\xa0313. Then, in the reign of Emperor Theodosius \"the Great\" (AD\\xa0378–395), Christianity was made the official religion of the Roman Empire.\\n\\nWhen Roman rule eventually ceased, Christianity was probably confined to urban communities. At Wiggonholt, on a tributary of the River Arun, a large lead tank with repeated chi-rho motifs was discovered in 1943, the only Roman period artefact in Sussex found with a definite Christian association. It may represent a baptismal font or a container for holy water, or alternatively may have been used by pagans.\\n\\nMedieval\\n\\nSaxon\\n\\nAfter the departure of the Roman army, the Saxons arrived and founded the Kingdom of Sussex in the 5th century, bringing with them their polytheistic religion. The Saxon pagan culture probably caused a reversal of the spread of Christianity. According to Bede, Sussex was the last of the mainland Anglo Saxon kingdoms to be converted.\\n\\nÆðelwealh became Sussex\\'s first Christian king when he married Eafe, the daughter of Wulfhere, the Christian king of Mercia. In 681 St Wilfrid, the exiled Bishop of York, landed at Selsey and is credited with evangelising the local population and founding the church in Sussex. King Æðelwealh granted land to Wilfrid which became the site of Selsey Abbey. The seat of the Sussex bishopric was originally located here before the Normans moved it to Chichester Cathedral in 1075. According to Bede, Sussex was the last area of the country to be converted. However it is unlikely that Sussex was wholly heathen when Wilfrid arrived. Æðelwealh, Sussex\\'s king, had been baptised. 
Damianus, a South Saxon, was made Bishop of Rochester in the Kingdom of Kent in the 650s; this may indicate earlier missionary work in the first half of the 7th century. At the time of Wilfrid\\'s mission there was a monastery at Bosham containing a few monks led by an Irish monk named Dicul, which was probably part of the Hiberno-Scottish mission of the time. Wilfrid was a champion of Roman customs and it was these customs that were adopted by the church in Sussex rather than the Celtic customs that had taken root in Scotland and Ireland.\\n\\nShortly after Æðelwealh granted land to Wilfrid for the church, Cædwalla of Wessex killed Æðelwealh and conquered Sussex. Christianity in Sussex was put under control of the diocese of Winchester. It was not until c. 715 that Eadberht, Abbot of Selsey was consecrated the first bishop of the South Saxons.\\n\\nSt Lewinna, or St Leofwynn, was a female saint who lived around Seaford, probably at Bishopstone around the 7th century. According to the hagiography of the Secgan Manuscript, Lyminster is the burial place of St Cuthflæd of Lyminster. In the late 7th or early 8th century, St Cuthman, a shepherd who may have been born in Chidham and had been reduced to begging, set out from his home with his disabled mother using a one-wheeled cart. When he reached Steyning he saw a vision and stopped there to build a church. Cuthman was venerated as a saint and his church was in existence by 857 when King Æthelwulf of Wessex was buried there. Steyning was an important religious centre and St Cuthman\\'s grave became a place of pilgrimage in the 10th and 11th centuries. In 681, Bede records that an outbreak of the plague had devastated parts of England, including Sussex, and the monks at Selsey Abbey fasted and prayed for three days for an end to the outbreak. A young boy with the plague prayed to St Oswald and his prayers were answered, and a vision of St Peter and St Paul was said to have appeared to the boy, telling him that he would be the last to die.\\n\\nThe church built at Steyning was one of around 50 minster churches across Sussex and these churches supplied itinerant clergy to surrounding districts. Other examples are churches at Singleton, Lyminster, Findon and Bishopstone. The jurisdiction of each minster church in the pre-Viking era seems to match early land divisions that were replaced by hundreds in the 10th or 11th centuries. It was not until 200–300 years after its conversion to Christianity in the 680s that a network of local parish churches existed in Sussex.\\n\\nVarious monastic houses were established in the Saxon period in Sussex including at Selsey Abbey, Lyminster Priory, Aldingbourne, Beddingham, Bosham, Chichester, Ferring and South Malling, near Lewes.\\n\\nNorman and Angevin\\n\\nFollowing the Norman Conquest of 1066, there was a purge of the English episcopate in 1070. The Anglo-Saxon Bishop of Selsey was deposed and replaced with William the Conqueror\\'s personal chaplain, Stigand. During Stigand\\'s episcopate the see that had been established at Selsey was transferred to Chichester after the Council of London of 1075 decreed that sees should be centred in cities rather than vills. 1094 saw the completion of Battle Abbey, which had been founded on the site of the Battle of Hastings after Pope Alexander II had ordered the Normans to do penance for killing so many people during their conquest of England. Monks also planned out the nearby town of Battle shortly after the conquest. 
Many of the monastic houses of this period were founded by Sussex\\'s new Norman lords. Around 1081, the lord of Lewes Rape, William de Warenne and his wife Gundrada formed England\\'s first and largest Cluniac monastery at Lewes Priory. The lord of Arundel Rape, Roger de Montgomerie established Arundel Priory in 1102. Sele Priory in the Rape of Bramber was founded by the Braose family by 1126.\\n\\nBishop Ralph Luffa is credited with the foundation of the current Chichester Cathedral. \\nThe original structure that had been built by Stigand was largely destroyed by fire in 1114.\\n\\nThe medieval church also set up various hospitals and schools in Sussex, including St Mary\\'s Hospital in Chichester (c. 1290-1300); St Nicholas\\' Hospital in Lewes, which was run by the monks of Lewes Priory; and the Prebendal School close to Chichester Cathedral.\\n\\nThe archdeaconries of Chichester and Lewes were created in the 12th century under Ralph Luffa.\\n\\nSussex has strong links with the Knights Templar and the Knights Hospitaller including at Shipley, Poling and Sompting.\\n\\nIn the 13th century, Richard of Chichester was canonised as a saint, and a shrine dedicated to him at Chichester Cathedral became an important place of pilgrimage. St Richard later became Sussex\\'s patron saint.\\n\\nIn 1450 Adam Moleyns became the first and only bishop of Chichester to be assassinated. Troops had been gathered to send to the war in France, but bad weather delayed their departure, and troops raided several towns along the coast. Moleyns was sent to Portsmouth to pay troops their outstanding wages, but was beaten so severely by the mob of soldiers that he died.\\n\\nThere is very little evidence of Lollardy in Sussex in the 15th century. Only one person was burnt to death as a Lollard, Thomas Bageley. Goring argues that pockets of Lollardy existed in the High Weald for over a century before Henry VIII\\'s break with Rome. Lollards tended to congregate near diocesan boundaries so that they could flee across the boundary to safety. Reginald Pecock, bishop of Chichester from 1450–1459, was accused of heresy and only saved his life by privately and publicly renouncing his opinions.\\n\\nEarly modern\\nDuring this period Sussex has been described \"as an anomaly: a southern county with a religious dynamic more in keeping with those of the north, connected to the Continent as much as the rest of the country, an entity that resisted easy co-option into Elizabeth I\\'s \\'little Israel of England\\'.\" Rye was probably the most Protestant of all Sussex towns, gaining a reputation as a \\'godly commonwealth\\' well before the end of Henry VIII\\'s reign. There was also strong opposition to the imposition of mass by Mary I.\\n\\nThe Reformation\\n\\nAs in the rest of the country, the Church of England\\'s split with Rome during the reign of Henry VIII was felt in Sussex. In 1535, the king appointed Sir Thomas Cromwell as vicar-general. Cromwell visited Sussex later in 1535, as part of his national census of churches and monasteries. The census was intended to enable the more efficient taxing of church property. The following year, an Act was passed that decreed the dissolution of monasteries with an income of less than £200 per annum. This first phase was followed by the \"voluntary\" surrenders of the larger houses. Lewes Priory with Battle, was the first house in England, during the Dissolution, to surrender on a voluntary basis. 
The monks surrendered the house in November 1537 in return for either being given a small pension or a living as a priest. The site and possessions of Lewes Priory were granted to Henry VIII\\'s vicar-general, Thomas Cromwell, who passed Lewes Priory to his son, Gregory Cromwell. Sussex did not do too badly compared to the rest of the country, as it only had one person in 500 who was a member of a religious order, compared to the national average of one in 256.\\n \\nIn 1538 there was a royal order for the demolition of the shrine of St Richard of Chichester in Chichester Cathedral, with Thomas Cromwell saying that there was \"a certain kind of idolatry about the shrine\".\\n\\nRichard Sampson, Bishop of Chichester, incurred the displeasure of Cromwell and ended up imprisoned in the Tower of London at the end of 1539. Sampson was released after Cromwell\\'s fall from favour and execution in 1540. Sampson then continued at the see of Chichester for a further two years. He was succeeded as Bishop of Chichester by George Day. Day opposed the changes, and incurred the displeasure of the royal commissioners, who promptly suspended him as Bishop and allowed him only to preach in his cathedral church.\\n\\nHenry VIII died in 1547; his son Edward VI continued on the path that his father had set. However his reign was only short-lived as he died after only six years.\\n\\nThe bishops of Chichester had not been in favour of the Reformation until the appointment of John Scory to the episcopate in 1552. During Henry VIII\\'s reign two of the canons of Chichester Cathedral had been executed for their opposition to the Reformation, and during Edward VI\\'s reign George Day was ultimately imprisoned for his opposition to the reforms.\\n\\nReign of Mary I\\nThere had been twenty years of religious reform when the Catholic Mary Tudor succeeded to the throne of England in 1553. Mary expected her clergy to be unmarried, so Bishop Scory thought it prudent to retire as he was a married man, and George Day was released and restored to the see of Chichester.\\n\\nMary\\'s persecution of Protestants earned her the nickname \"Bloody Mary\". Nationally about 288 Protestants were burnt at the stake during her reign, including 41 in Sussex. Most of the executions in Sussex were at Lewes. Of these 41 burnings, 36 can be identified to have come from specific parishes, and the place of execution is known for 27 of them, because the details of the executions were recorded in the Book of Martyrs by John Foxe, published in 1563. Martyrs included Deryck Carver, a French-speaking Flemish man who had sought refuge in Brighton from persecution for his Calvinist beliefs; and Richard Woodman, an ironmaster from Buxted. There are Bonfire Societies in Sussex that still remember the 17 Protestant martyrs who were burned in Lewes High Street, and in Lewes itself they have a procession of martyrs\\' crosses during the bonfire night celebration. According to Quinn, the authorities in Sussex during Mary\\'s reign were rather less bloodthirsty than is generally assumed, often allowing their opponents to slip the noose when they could. Carver\\'s meetings had been attended by many fishermen from both England and France, beginning the tradition of French Christian worship in Brighton.\\n\\nThere was a range of Protestant beliefs in Sussex during the reign of Queen Mary.
Sussex\\'s proximity to the Continent left it particularly exposed to European Protestantism, while its proximity to large parts of the Weald also left it open to pre-Reformation Protestantism. This was particularly so in the east of the county, with its trade links to Protestant areas of northern Europe, its large share of the Weald, and its proximity to the Kentish border.\\n\\nReign of Elizabeth I\\nWhen Mary died in 1558, she was replaced by her Protestant sister Elizabeth I. Elizabeth re-established the break with Rome when she passed the 1559 Acts of Supremacy and Uniformity: the clergy were expected to take statutory oaths, and those that did not were deprived of their living. In the county nearly half the cathedral clergy and about 40% of the parish clergy had to be replaced, although some of the vacancies were due to ill health or death.\\n\\nA case can be made for the Reformation as a religious phenomenon only arriving in Sussex with Bishop Richard Curteys from 1570. In the west, Curteys\\' reforms were hampered by the noble Catholic families, and in the east by more radical forms of Protestantism. Until then the loyal but conservative bishops Sherborne, Sampson and Day did not appear to enforce doctrinal orthodoxy. Through the influence of Richard Curteys, the Reformation in Sussex took on a Puritan tone from the 1570s and a tradition of \\'radical parochialism\\' developed with well-educated preachers supporting ministers, often sponsored by Puritan landowners. Curteys circumvented the existing clergy by bringing in \\'lecturers\\' or unbeneficed clergy who provided a new preaching tradition, and also gathered some existing clergy who were sympathetic to his aims. This was particularly strong in the Lewes area, in part because of its European trade links.\\n\\nDuring the 1570s Puritan Christian names like \"Feregod\" became common in the Weald. Far from the seat of the Bishop of Chichester, radical towns like Rye and Lewes became \"free-thinking\" Protestant towns, and numbers of Protestants increased, with Huguenots seeking refuge after the St Bartholomew\\'s Day massacre in France. In the 1560s and 1570s, there was a trend for giving Puritan children \"godly\" names, especially in East Sussex, signifying a Puritan counter-culture. Eighteen parishes in the east of Sussex record Puritan names, the highest concentration of which was in Warbleton, where around half the children were given Puritan names between 1587 and 1590. Such Puritan names included \"Be-courteous Cole\" (in Pevensey), \"Safely-on-High Snat\" (in Uckfield) and \"Fight-the-Good-Fight-of-Faith White\" (in Ewhurst). One child with a Puritan name, Accepted Frewen, later became Archbishop of York. Many Sussex Puritans emigrated across the Atlantic Ocean to New England, accounting for about 1% of New England\\'s immigrants. Puritan migrants from other English regions, such as East Anglia, had much lower usage of hortatory names, and Puritans in the US state of Massachusetts followed the East Anglian rather than the Sussex naming custom.\\n\\nIn the late 16th century, Sussex was a complicated and divided region. The countryside was largely Catholic, dominated by the ancient Catholic families: the Howards at Arundel, the Percys at Petworth House, the Gages at Firle, the Brownes (the Lords Montague) at Cowdray Park, the Palmers at Parham House, as well as other minor dynasties like the Carylls, Lewkenors, Shelleys and Kemps.
At the start of Elizabeth\\'s reign all six of Sussex\\'s noble families were Catholic. The towns, including Rye and Lewes, were more likely to be controlled by Protestants if not Protestant in orientation. The Earl of Arundel, Henry FitzAlan, had considerable influence as Lord Steward of the Royal Household, privy councillor and Lord Lieutenant of Sussex (1559-1569) until he was involved in the Ridolfi plot to marry his son-in-law, Thomas, Duke of Norfolk, to Mary Queen of Scots. Even after the 1580s when restrictions on Catholics were imposed, Sussex continued to be led by Catholic peers. The office of sheriff of Sussex was held by Catholics eleven times between 1558 and 1603.\\n\\nAt the end of Elizabeth\\'s reign, Catholicism continued to be tolerated. On the death of her husband, Lady Montague withdrew to Battle Abbey, the family\\'s seat in the east of the county. The establishment of what became known as \"Little Rome\" became a focal point for the local Catholic community, with as many as 120 people attending Mass. This shows that long-standing political loyalty by Catholics was repaid by a form of toleration.\\n\\nThe Catholic Sussex families which suffered imprisonment or financial ruin at this time were mostly those that were involved in conspiracies against Elizabeth. After the uprising of 1569, the eighth Earl of Northumberland was effectively sent into internal exile in Sussex, at his home at Petworth House. After 1577, central authorities mounted a growing attack on Catholic recusants, forcing them to abandon apparent conformity at a greater cost. Fines for non-attendance at an Anglican church were increased from 12d per week to 20 pounds per month. In 1580 leading Sussex Catholics including John Gage of Firle and Richard Shelley of Warminghurst were imprisoned for recusancy and continued to pay the taxes and fines demanded. In 1583 Charles Paget was smuggled into England, meeting William Shelley at Patching to discuss a plan to land Spanish, German and Italian troops in Sussex and march to Petworth House, the home of Northumberland, and Arundel Castle, while a second force would land in Lancashire and be joined by an uprising of English Catholics. Shelley\\'s and Northumberland\\'s actions reveal there was some truth in the suspicions directed against Sussex Catholics.\\n\\nWith further legislation in the 1580s, Sussex Catholics caught harbouring priests were guilty of treason. Significantly, no member of the Sussex gentry or nobility was ever charged under these laws, and neither was there ever any uprising, even though there was a significant Catholic community in Sussex. In this, the west of Sussex was out of step with the rest of England, just as attempts to impose a \"Godly magistracy\" in Rye in the east of the county were out of step with the rest of Protestant England. During this period Sussex was often different from the rest of England, with east and west of the county often inversions of each other. West Grinstead Park, home of the Caryll family, became a Roman Catholic mission where priests arrived, generally at night up the River Adur to await \"posting\". The River Adur was extensively used by the many Catholics travelling covertly between London and the Continent. Thomas Pilchard was executed in 1587 for being a priest and Edward Shelley of Warminghurst died at Tyburn in London in 1588 for hiding a priest.
In 1588 two Catholic priests, Ralph Crockett and Edward James, were arrested at Arundel Haven (now Littlehampton), taken to London and executed outside Chichester. Philip Howard, 20th Earl of Arundel, who was canonised in 1970 as one of the Forty Martyrs of England and Wales, spent much of his life at his family home of Arundel Castle. From a family of Catholic recusants, Howard was imprisoned in the Tower of London for leaving the country without the permission of Queen Elizabeth. He died there ten years later. Early in the 17th century, Bosham-born Benedictine priest, George Gervase, was executed in London.\\n\\n17th century\\n\\nIn the 17th century, the diocese of Chichester was home to several Arminian bishops, including Bishops Andrews, Harsnett, Montagu, Duppa and King.\\n\\nIn the 1620s and 1630s many communities had licensed preachers. Lectureships at Rye, Lewes, Horsham and Midhurst extended preaching to the towns with the full support of the local gentry. From this time, Sabbatarianism gained ground with suppression of games and disorder. Bishop Montagu put forward extreme views against Puritanism and stressed the importance of ritual. Anthony Stapley, chairman of the Michaelmas quarter sessions in Sussex, was persuaded by Puritans to develop a harangue against the bishops in 1639, and in 1641 Stapley and Thomas Pelham petitioned Parliament on this issue. Latent hostility towards Catholics increased; and although Sussex contained as large a proportion of recusant households as many of the northern counties, few Catholic gentry in the county openly supported the king.\\n\\nThere were no battles of national significance in Sussex during the 1642–1651 English civil war; however there were small sieges at Chichester and Arundel. The west of the county was generally royalist, although Chichester was for parliament and the east of the county, with some exceptions, was also for parliament. A few churches were damaged, particularly in the Arundel area. Also, after the surrender of Chichester, the Cathedral was sacked by Sir William Waller\\'s parliamentary troops. Bruno Ryves, Dean of Chichester Cathedral, said of the troops that \"they deface and mangle [the monuments] with their swords as high as they could reach\". He also complained that Waller\\'s troops...\\n\"... brake down the Organs and dashing the pipes with their Pole-axes...\"\\nMercurius Rusticus p. 139\\nDestruction of the cathedral\\'s music seems to have been one of the objectives of Waller\\'s men, as Ryves also said, that...\\n\"they force open all the locks, either of doors or desks wherein the Singing-men laid up their Common-Prayer Books, their singing-Books, their Gowns and Surplesses they rent the Books in pieces and scatter the torn leaves all over the Church, even to the covering of the Pavement..\"\\nMercurius Rusticus p. 140\\n\\nIn 1643, Francis Bell, one of the priests at the Catholic mission in West Grinstead, was executed, along with other priests. The Caryll family were frequently persecuted and fined.\\n\\nDuring Cromwell\\'s interregnum, Rye stood out as a Puritan \\'Common Wealth\\', a centre of social experiment and rigorous public morality under vicar Joseph Beeton and his successor John Allen.
The people of Rye seem in general to have ignored the strict sabbatarianism enforced by the constables, particularly where \\'immoderate drinking\\' was concerned.\\n\\nSussex Quakers and emigration to British North America\\n\\nAbout a quarter of the incumbents were forced from their parishes and replaced with Puritans. Many people turned away from the traditional churches and in 1655 George Fox founded the Society of Friends at Horsham. Quakerism emerged in Sussex in the 1650s, to be firmly suppressed by a gentry concerned about its revolutionary tendencies. In 1656, Thomas Haycock of Horsham became the first person in Sussex to be sent to gaol for their Quaker beliefs. William Penn lived in the county for a while; in 1676 he bought the estate of Warminghurst, near Steyning. In 1677 a huge open air meeting of Quakers was held at Penn\\'s home in Warminghurst in defiance of the law, with several hundred Quakers attending. Then in 1681 Charles II granted Penn lands in what became Pennsylvania and Delaware. Amongst those whom he carried to Pennsylvania as colonists were 200 people from Sussex. In 1682 Penn left the Kent port of Deal for the Province of Pennsylvania with about 100 passengers, mostly Quakers and mostly from Sussex. Quakers to leave Sussex for Pennsylvania included Samuel Carpenter who founded Horsham Township, Pennsylvania; and in 1677 William Clayton left for Pennsylvania, where his family, with others, founded a township they called Chichester, and opened the Chichester Friends Meetinghouse. Penn also created Sussex County and renamed the settlement of Hoernkills as Lewes.\\n\\nFollowing the Rye House Plot of 1683 a new wave of religious persecution swept across England. Until the Toleration Act received royal assent in 1689, Quakers in Sussex and elsewhere suffered considerable persecution, and many were imprisoned in Horsham Jail. While living at Warminghurst, Penn too was persecuted for his Quaker faith. The 1684 Chichester Quarter Sessions recorded that William Penn \"being a factitious and seditious person doth frequently entertain and keep an unlawful assemblage and conventicle in his dwelling house at Warminghurst to the terror of the King\\'s liege people.\" Penn sold the estate, at Warminghurst, to a James Butler in 1707.\\n\\nSussex Quakers, including George Fox and William Penn, debated with Matthew Caffyn, a General Baptist preacher and writer. There is a well-known account in 1655 when two Quakers from the north of England, Thomas Lawson and John Slee, disputed doctrine with Caffyn. As a result of their debates, Lawson produced a pamphlet entitled An Untaught Teacher Witnessed Against (1655) and Caffyn produced a pamphlet Deceived and Deceiving Quakers Discovered, Their Damnable Heresies, Horrid Blasphemies, Mockings, Railings (1656). In 1696, Caffyn\\'s increasingly radical, unorthodox beliefs caused a schism in the General Baptist Assembly, and its response to his changing theology was significant in the development of Unitarianism. The attorney-general of Rye, Samuel Jeake, was exiled from the town after being found guilty of preaching under the Five Mile Act 1665. He was forced to remain outside of Rye until 1687 when the toleration which James II extended to Protestant dissenters enabled him to return to Rye.\\n\\nThe Restoration of the English monarchy began in 1660 under Charles II.
It took over a year, after the restoration of Charles II in May 1660, for Chichester cathedral to get its choir back to full strength.\\n\\nIn the late 17th century, Sussex was a stronghold of the General Baptists.\\n\\nIn 1676 the Sussex parishes with the highest proportion of Catholics were almost entirely in the two most westerly Rapes of Chichester and Arundel: at least ten per cent of the population were Catholic in the parishes of Burton, Clapham, Coates, Midhurst, Racton, Shipley and Westfield.\\n\\nIn 1678 a former Hastings rector, Titus Oates, fabricated the \"Popish Plot\", a supposed Catholic conspiracy to assassinate King Charles II and replace him with James (later James II). The plot led to the false implication, imprisonment and execution of William Howard. As a \\'Catholic of distinction\\', the seventh John Caryll from Sussex was imprisoned in the Tower of London but was let out on bail. Following the persecutions and executions that followed the Titus Oates plot, the death penalty for being a priest was removed. Instead, unscheduled fines were doubled and all remaining civil rights were removed from people keeping the Roman Catholic faith. At this stage, most Sussex Catholic families conformed to the Anglican church, except notably for the Caryll family. In 1688 the seventh John Caryll went into exile to Saint-Germain in France with James II as private secretary to James\\' queen, Mary of Modena.\\n\\nLate modern\\n\\n18th century\\nThere was a significant decline in non-conformity in Sussex in the early 18th century. Between 1676 and 1724 the strength of non-conformity in the county was reduced by at least one quarter. Around a third of the parishes in Sussex in 1724 had no dissenters. For instance in 1676, Horsham had over 100 non-conformists but by 1724 there were just 34.\\n\\nThe number of dissenters fell from 4,300 in 1676 to around 3,300 in 1724. In the 18th century, the Sussex grocer Thomas Turner left a diary which suggests a high level of theological literacy amongst laypeople. At this time, the Sussex Weald and bordering towns such as Lewes were home to a number of fundamentalist sects. Cade Street Chapel in Heathfield was founded in 1769 for the followers of George Gilbert, who was popularly styled as \\'The Apostle of Sussex\\'. Gilbert also preached in surrounding villages, often with great hardship and difficulty: at Ticehurst he was pelted with stones when the bells rang; at Bexhill he was plastered from head to toe in filth, and a large drum was played to drown out the sound of his voice until a woman put a knife into the drum.\\n\\nUnder Caffyn\\'s guidance a General Baptist chapel was founded in Horsham in 1719, bringing together Baptists who had met in small house-groups in the town since 1669 or possibly as early as 1645. Worshippers from across northern Sussex came to this chapel; many were from the village of Billingshurst a few miles away. This group later became large enough to split from the Horsham congregation and establish a chapel in their home village.\\n\\nMethodist pioneers came to the Rape of Hastings in 1756, with John Wesley visiting Rye in 1758. Wesley\\'s last open air sermon was held in nearby Winchelsea in 1790. The Countess of Huntingdon\\'s Connexion\\'s first church was set up in 1761 in North Street, Brighton in what was originally Selina, Countess of Huntingdon\\'s garden.\\n\\nSussex had a significantly larger proportion of Catholics than other southern counties.
Between 1715 and 1720, 8 per cent of the population of Sussex were registered as Catholic, a proportion more in common with counties north of a line from the River Severn to the Wash. John Baptist Caryll, the last of the Caryll family, was penalised for his Catholic faith and was forced in 1754 to sell his Sussex homes including that at West Grinstead. He endowed the Priest\\'s House to the Catholic Church via Lewes-born bishop Richard Challoner so that Catholic mass could be continued in the locality. When Challoner visited the West Grinstead Mission in 1741 he found 80 Catholics at Mass. Finally, history cannot forget the famous recusant, Maria Fitzherbert, who during this period secretly married the Prince of Wales, later Prince Regent and George IV, in 1785. The British constitution, however, did not recognise the marriage and George IV later moved on. Cast aside by the establishment, she was adopted by the town of Brighton, whose citizens, both Catholic and Protestant, called her \"Mrs. Prince.\" According to journalist Richard Abbott, \"Before the town had a [Catholic] church of its own, she had a priest say Mass at her own house, and invited local Catholics\", suggesting the recusants of Brighton were far from undiscovered.\\n\\n19th century\\n\\nRoman Catholic Church\\nBrighton\\'s Roman Catholic community at the time of the Relief Act was small, but two factors caused it to grow in the 1790s. Many refugees from the French Revolution settled in Brighton after escaping from France; and Maria Fitzherbert, a twice-widowed Catholic, began a relationship with the Prince Regent (and secretly married him in 1785 in a ceremony which was illegal according to the Act of Settlement 1701 and the Royal Marriages Act 1772). She accompanied the Prince Regent whenever he visited Brighton, and had her own house (Steine House on Old Steine).\\n\\nThe first Catholic place of worship since the Reformation in Brighton was established above a shop in 1798; it was one of the earliest in Britain. In 1805 the priest in charge, a French émigré, started to raise money for a permanent building; a site on High Street, east of the Royal Pavilion and Old Steine, was found, and the Classical-style church was completed in 1807. It was demolished in 1981.\\n\\nIn 1818 the new rector, a friend of Maria Fitzherbert, wanted to extend the church. Mrs Fitzherbert donated £1,000 for this purpose, but before any action could be taken the events of 1829, when Catholic emancipation was fully achieved, encouraged Brighton\\'s Catholic community to seek a new site for a larger, more elaborate church. A piece of undeveloped land on the estate of the Marquess of Bristol was bought for £1,050, and William Hallett, later a mayor of Brighton, designed and built the new church of St John the Baptist. It was consecrated on 7 July 1835 and opened on 9 July 1835. Many of the 900 Catholic churches opened in England since the 1791 Roman Catholic Relief Act had not been consecrated by that stage, so St John the Baptist\\'s was only the fourth new church to be consecrated in England since the Reformation in the 16th century.\\n\\nFounded in 1873, St. Hugh\\'s Charterhouse, Parkminster is the first and only post-Reformation Carthusian monastery in the United Kingdom. In 1876 the Shrine Church of Our Lady of Consolation of West Grinstead was established, becoming the first Catholic shrine in honour of Mary to be established in England since the Reformation. Sussex was covered by the new Roman Catholic diocese of Southwark, created in 1850.
New priests for the Catholic diocese of Southwark began to train at West Grinstead until they could move to a larger domestic property at Henfield. The diocese then moved its seminary to a purpose-built site in Surrey.\\n\\nNon-conformist churches\\nDespite Methodism\\'s early progress around Rye and Winchelsea in the Rape of Hastings, Methodism took longer to gain ground in the rest of Sussex. Methodism in the coastal towns of Sussex had a very unusual origin in that it was Methodists in the army who were the main or contributory founders of Methodism in towns from Chichester to Bexhill, including Lewes. Michael Hickman has argued that it was not until 1803 when Methodists and others in the army were allowed to worship freely on Sundays that Methodist soldiers could support or found Methodist societies in Sussex. 1805 saw the timber-framed Jireh Chapel open in Lewes, for the Calvinist William Huntington, whose tomb is at the rear of the chapel.\\n\\nThe General Baptist congregations at Billingshurst, Ditchling and Horsham gradually moved from General Baptist beliefs towards Unitarianism in the early 19th century.\\n\\nIn the mid 19th century John Sirgood founded the Society of Dependants at Loxwood in the north of the county. Nicknamed the \\'Cokelers\\', their beliefs were largely derived from Wesleyan Arminianism. They believed in the people\\'s ability to exercise free will and thereby achieve salvation rather than the Calvinistic assertion of predestination. They first established themselves at Loxwood because it was outside of the control of the large estates whose Anglican owners would have denied them land or premises. As well as at Loxwood, the Society of Dependants went on to found places of worship at Chichester, Hove, Northchapel and Warnham, as well as at three locations in Surrey.\\n\\n1851 census\\nIn 1851 the authorities organised a census of places of worship in England and Wales. The figures for Sussex indicated that there were more Anglican than non-conformist places of worship. In the neighbouring counties of Hampshire and Kent, there were more non-conformist places than Anglican.\\n\\nThe 1851 census shows that the Anglican church was particularly strong in the west of the county. These were areas where settlements were predominantly nucleated, with small parishes. Thakeham had the second highest rate of Anglicans in England (96% Anglican). Steyning, Petworth, Westhampnett and Westbourne were also over 80% Anglican. Anglican churches did well in the coastal towns including Brighton. In parts of the Sussex Weald the Anglican church had fewer churches than many other denominations, but not in terms of attendances at these churches.\\n\\nJust over 40% of the places of worship in Sussex in 1851 were non-conformist, mainly Independents, Wesleyan Methodists and Baptists. There were also smaller congregations of Catholics, Quakers, Countess of Huntingdon\\'s Connexion and Unitarians. Non-conformist chapels did well particularly in the Weald.\\n\\nOld dissent - dating back to Lollardy, such as Baptists, Unitarians and Quakers - remained more popular than new dissent and could be seen particularly in the Weald and on the Downs. It was particularly noticeable in towns such as Brighton, Shoreham, Hastings and Rye. Some parts of Sussex were areas of strength for Baptists, but the west was an area of relative weakness. Overall, Sussex had some of the fewest Wesleyan Methodist adherents in all of England.
However Wesleyan Methodism was strong in the rape of Hastings along the border with Kent; it was weakest in the county west of Eastbourne. Primitive Methodists were almost absent from Sussex.\\n\\nPrimitive Methodists were almost completely absent from Sussex. Of the 44 Sussex parishes with Catholics in 1676, only two, Arundel and Slindon, also had a Catholic place of worship in 1851.\\n\\nAnglo-Catholic reform in the Anglican Church and subsequent protest\\nIn the mid 19th century, divine Frederick William Robertson became well-known and preached at the Holy Trinity Church, Brighton.\\n\\nFormed in the 19th century, the cult of the Sussex martyrs was instigated at a time of the restoration of the Catholic hierarchy in England, bolstered by an increase in the Irish Catholic population, as well as the high-profile conversion to Catholicism of members of the Oxford movement, including Cardinal Newman and former Archdeacon of Chichester, Henry Edward Manning. Mark Antony Lower, an anti-Catholic propagandist and schoolmaster from Lewes, inaugurated the cult of the Sussex martyrs after the publication of his 1851 book The Sussex Martyrs to recall the dire actions of Catholicism in Sussex. Hostility to the Roman Catholic church, strong shortly after the Reformation had virtually died out by the early 19th century when religious tolerance was dominant mood. This began to change with the Evangelical Revival. The first Methodists to preach in Lewes were Calvinist Methodists, who saw the world as a sharp contrast between good and evil, God and the devil. The natural recipients of their negative projections were Catholics, who were becoming tolerated in England. More petitions were to come out of Lewes against Catholic emancipation that any other town in southern England. They came not from the old dissenters who favoured toleration but from the newly-formed Calvinist congregations. The local press in Lewes pandered to these prejudices. The introduction of ritualist practices in the Anglican church further increased anti-Catholic attitudes in Lewes.\\n\\nIn the mid 19th century the practice of burning an effigy of Pope Paul V at the Lewes Bonfire celebrations began. Paul V was a peaceable man who happened to be pope at the time of the Gunpowder Plot in 1605 and who cannot be held responsible for the Gunpowder Plot or the persecution of Protestants in the reign of Mary I, which were linked at this time by a misunderstanding of the past. In 1893 William Richardson, rector of the Southover district of Lewes, held sermons on the Sunday before 5 November warning about the perils of Catholicism. Many attendees were members of the newly-formed Orange Lodge in Lewes.\\n\\nAt the end of the 19th century and beginning of the 20th century, memorials were erected across Sussex and several other English counties to honour people burnt to death as heretics in the reigns of Henry VIII and Mary I. These were largely a reminder of religious divisions of more than three centuries earlier which seemed remote from the public preoccupations of the day. The actions could only be seen an anti-Catholic or at least anti-papal. Whilst moderate supporters did not wish to offend the Catholic community, a memorial in Heathfield read \"burnt to death at Lewes by the Roman Catholics\". These monuments did not commemorate the martyrdoms of Catholics or the Protestant opponents of state-imposed orthodoxy, except where they were erected by nonconformists. 
Anger was directed against the Anglo-Catholic community more than Catholics.\\n\\nIn the Anglican church in the 19th century, the role of ritual became subject of great, often heated, debate. In Brighton the Anglican church became influenced by the Oxford Movement, to an extent unparalleled elsewhere in the country apart from London. In Anglo-Catholic circles, Brighton became associated with London, as in the collective title of \"London-Brighton and South Coast Religion\", a play on the name of the main railway company in Victorian Sussex, the \"London, Brighton and South Coast Railway\". The railway, coincidentally or otherwise, linked all the large and growing centres of Anglo-Catholic worship spreading from London to Brighton and then east and west along coast of Sussex to the neighbouring counties of Kent and Hampshire. Anglo-Catholic priests in Brighton, included Henry Michell Wagner whose churches included St Paul\\'s Church and there was a powerful Protestant reaction including a riot in 1880. Brighton vicar Rev John Purchas was charged and ritualism spread to churches in Hastings and Worthing. Various militant Protestant groups formed branches and lodges across the county. Richard Enraght was also tried, arrested and imprisoned. The prolific Anglo-Catholic hymnologist John Mason Neale was attacked by a mob and hostile demonstrations ensued at East Grinstead.\\n\\nIn 1884 rioting ensued in Worthing, Eastbourne and Shoreham as mobs of people including members of the Skeleton Army reacted to Salvation Army criticism.\\n\\nContemporary Christianity\\n\\nChurch of England\\nIn the Church of England in Sussex, the administration of the diocese of Chichester which covers the county was changed in 1912. In addition to the existing archdeaconries of Chichester and Lewes that date from the 12th century, a third archdeaconry of Hastings was created. This structure remained in place until the archdeaconries were reorganised under Eric Kemp in 1975. The archdeaconry of Hastings was dissolved and merged back into the archdeaconry of Lewes, which was renamed the archdeaconry of Lewes and Hastings. A new archdeaconry was created in the north of the county - the archdeaconry of Horsham. This structure remained until 2014 when the archdeaconry of Hastings was recreated in the east of the county and the archeaconry of Lewes and Hastings was renamed the archdeaconry of Brighton and Lewes. The suffragan Bishop of Horsham oversees the archdeaconries of Chichester and Horsham, while the suffragan Bishop of Lewes oversees the archdeaconries of Brighton & Lewes and Hastings. The bishop of Chichester retains oversight over the entire diocese of Chichester i.e. all of Sussex.\\n\\nOn 16 November 2001, Pat Sinton, became the first woman priest in Sussex to be ordained. Sinton was ordained by John Hind, the bishop of Chichester, following the departure of the previous bishop of Chichester, Eric Kemp. Although Kemp had encouraged women to serve in the permanent diaconate in his diocese he had been an opponent of the ordination of women to the priesthood and women priests were not licensed in the Diocese of Chichester during his episcopate. In September 2014 Fiona Windsor was made archdeacon of Horsham, making her the first female archdeacon in Sussex. 
\\nThe Church of England in Sussex was damaged by sexual abuse scandals in the early 2000s.\\n\\nRoman Catholic Church\\nIn 1900 the Roman Catholic nun Maude Petre began a friendship with the Jesuit priest George Tyrell, which resulted in Petre building a cottage for Tyrell in the garden of her Storrington home. Both Petre and Tyrell were major figures in the Modernist controversy of the early 20th century. The Roman Catholic Diocese of Arundel and Brighton was formed in 1965 out of part of the diocese of Southwark. It includes Sussex and Surrey. In the early 2000s, the sexual abuse scandal in the Arundel and Brighton diocese hurt the public\\'s trust in the work of local diocesan officials.\\n\\nRelations with Sussex churches\\nAppointed as Bishop of Chichester in 1929, George Bell was a vocal supporter of the German resistance to Nazism and a pioneer of the Ecumenical Movement that aimed for greater co-operation between churches. Bell established in 1955 the first ever County Council of Churches in Sussex, since which similar structures have been formed in other parts of England.\\n\\nThere is a history of religious antagonism and anti-popery around the bonfire celebrations in Lewes. In the 1930s the mayor of Lewes requested that \\'no popery\\' banners be removed and an end to the burning of effigies of Pope Paul V. In the 1950s the Cliffe Bonfire Society was banned from the Bonfire Council from taking part in the United Grand Procession for its refusal to stop carrying a \\'no popery\\' banner and banners commemorating the 16th century Protestant martyrs burned at Lewes. In Lewes, women were to a significant degree responsible for using the spirit of ecumenism to build bridges between the denominations that had until then continued to be anti-Catholic. In 1984 Sussex church leaders were invited to Lewes to discuss Protestant-Catholic relations. Attendees included Eric Kemp, Bishop of Chichester, Peter Ball, suffragan Bishop of Lewes and Cormac Murphy-O\\'Connor, Bishop of Arundel and Brighton, as well as their equivalent positions in the Baptist, Methodist and United Reformed churches. In a historic gesture after the meeting the leaders walked to the Martyrs\\' memorial and prayed for peace and reconciliation. The owners of the memorial, associated with Jireh Chapel, subsequently threatened the intruders for trespassing. The LDCC later persuaded BBC to make a Songs of Praise TV programme in Lewes on the theme of religious tolerance, broadcast on 5 November 1989. To many though, the bonfire celebrations have lost much of their religious meaning, with many Catholics taking part. There are parallels with the carnival celebrations that took place across western Europe when the established order was turned upside down and the lord of misrule held sway for the day. In 1981 Ian Paisley visited Lewes on Bonfire Night and tried to fan the flames of conflict by handing out anti-Catholic pamphlets. His intervention back-fired and the following year he was burned in effigy. Today, anti-Catholic attitudes are rare and the militant Calvinism that continues in Northern Ireland is all but extinct in Lewes.\\n\\nIn the 21st century, controversy continues to be associated around the Bonfire societies and competing definitions of tradition and bigotry. 
For instance, the burning in effigy of Pope Paul V was described in 2012 as \"a scandalous piece of stone-cold bigotry\"\\n\\nOther Christian denominations\\nEstablished in 1971 the Anabaptist Bruderhof community was founded near Robertsbridge, the earliest such community remaining in Europe. From the 1980s, Sussex has three Greek Orthodox churches - at Brighton, Hastings and Eastbourne.\\n\\nFollowing the Second Sudanese Civil War, many refugees came to Brighton and Hove and neighbouring areas. Hove and Worthing are now home to Coptic Orthodox Churches, two of 28 such churches in the British Isles. The churches were visited in 2017 by Pope Tawadros II of Alexandria and Bishop Paula of Tanta. In 1998 the congregation at Jireh Chapel in Lewes took the decision to affiliate with the Free Presbyterian Church of Ulster. The church is one of seven such churches established in England.\\n\\nIn the Old Roman Catholic Church in Europe in 2012, Jerome Lloyd was made Metropolitan Archbishop of Selsey (officially \"Archbishop Metropolitan of the Isle of the Seals (Selsey) and the New Market of the Regnenses (i.e. of the Celtic tribe the Romans conquered in AD43, now called Chichester) in the Kingdom of the South Saxons (i.e. Sussex)\". Based in Brighton, the archbishop is one of a small number of priests who broadcast the traditional Mass in Latin live, via the internet and is the only priest to do so in Europe. The archbishop works on various projects to help homeless people in Brighton.\\n\\nThe turn of the 21st century saw the rise of so-called mega-churches and neo-charismatic and evangelical churches including Kingdom Faith in Horsham, set up by Colin Urquhart and the Newfrontiers group founded by Terry Virgo.\\n\\nCurrent and former places of worship\\nLists of all current and former places of worship in Sussex by district are as follows:\\n\\n Adur District\\n Arun District\\n Brighton and Hove\\n Chichester (current)\\n Chichester (former)\\n Crawley\\n Eastbourne\\n Hastings\\n Horsham District\\n Lewes District\\n Mid Sussex\\n Rother\\n Wealden (current)\\n Wealden (former)\\n Worthing\\n\\nSee also\\n History of Christianity in England\\n History of Sussex\\n Religion in Sussex\\n List of monastic houses in East Sussex\\n List of monastic houses in West Sussex\\n History of local government in Sussex\\n\\nBibliography\\n\\nReferences\\n\\nChristianity in Sussex\\nHistory of Sussex\\nHistory of Christianity in England'}]}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset['dev'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each passage in the corpus has two parts: `docid` and `text`. `docid` has the form of `doc--`" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = load_dataset('Shitao/MLDR', f\"corpus-{lang}\", trust_remote_code=True)['corpus']" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'docid': 'doc-en-9633',\n", + " 'text': 'Mars Hill Church was a Christian megachurch, founded by Mark Driscoll, Lief Moi, and Mike Gunn. It was a multi-site church based in Seattle, Washington and grew from a home Bible study to 15 locations in 4 U.S. states. Services were offered at its 15 locations; the church also podcast content of weekend services, and of conferences, on the Internet with more than 260,000 sermon views online every week. 
In 2013, Mars Hill had a membership of 6,489 and average weekly attendance of 12,329. Following controversy in 2014 involving founding pastor Mark Driscoll, attendance dropped to 8,0009,000 people per week.\\n\\nAt the end of September, 2014, an investigation by the church elders found \"bullying\" and \"patterns of persistent sinful behavior\" by Driscoll. The church elders crafted a \"restoration\" plan to help Driscoll and save the church. Instead, Driscoll declined the restoration plan and resigned. On October 31, 2014, lead pastor Dave Bruskas announced plans to dissolve the church\\'s 13 remaining campuses into autonomous entities, with the option of continuing, merging with other congregations, or disbanding, effective January 1, 2015. The Mars Hill network dissolved on January 1, 2015.\\n\\nHistory\\n\\nEarly years \\nMars Hill Church was founded in spring 1996 by Mark Driscoll, Lief Moi and Mike Gunn. The church started at the rental house of Driscoll and his wife Grace with the blessing of Antioch Bible Church and the exodus of about 30 of its students. They outgrew the apartment and started meeting in the youth rooms of another church. The church had its first official service October 1996, with 160 people attending; attendance quickly fell to around 60 because of discussions about the visions and mission of the church.\\n\\nIn the spring of 1997, the church expanded to two evening services. The transition to two different congregations resulted in some anxiety and stir by members who didn\\'t want the church to grow bigger, but it resulted in growing attendance. Later that same year Mark Driscoll was invited to speak at a pastors\\' conference in California. Driscoll\\'s speech influenced the emerging church movement, and changed the focus from reaching Generation X to reaching the postmodern world. The speech resulted in media coverage of Mars Hill Church and Mark Driscoll, and put Driscoll in connection with Leadership Network.\\n\\nThe church continued growing. Inspired by Alan Roxburgh, Driscoll settled on an emerging and missional ecclesiology, and a complementarian view on women in ministry. The church installed the first team of elders and they took over much of the work teaching classes, counseling and training new leaders. Furthermore, the church started a course for new members, called the Gospel Class, to ensure that members were focused on the mission of the church and that they agreed with the central doctrinal statements of the church. The class had been running every quarter since. In the fall of 1999 the church had grown to 350 in attendance every week and was able to pay Driscoll full-time. Prior to 1999, Driscoll operated as an unpaid pastor for three years.\\n\\nMultisite church \\n\\nIn 2003, Mars Hill Church moved into a renovated hardware store in the Ballard neighborhood of Seattle. In 2006, in an effort to reduce the overcrowding at its services, Mars Hill opened its first satellite campus in Shoreline. This change also marked their transition to a multi-site church, using video sermons and other multimedia improvements to the church\\'s web site to connect the campuses. Later in 2006 Mars Hill acquired two new properties in West Seattle and Wedgwood, which became their West Seattle and Lake City campuses.\\n\\nSince then, new Mars Hill locations were added using a multi-campus \"meta-church\" structure, connecting Driscoll\\'s sermons via high-definition video to the remote campuses during weekly worship services. 
This format allowed each location to retain local leadership and ministries while under the leadership of the main campus. A fourth and fifth Mars Hill location opened in 2007, and in 2008 a sixth location was added in downtown Seattle. A seventh campus, in Olympia, Washington, opened in Fall 2008 and an eighth campus, the first outside of Washington state, opened in Albuquerque, New Mexico in Fall 2009. The church launched four new churches on January 15 in Portland (Oregon), Rainier Valley (Seattle), Sammamish (near Seattle), and Orange County (California), the same day as the first sermon in the \"Real Marriage\" sermon series, based on Mark and Grace Driscoll\\'s book, Real Marriage.\\n\\nOn October 16, \"black-clad demonstrators\" gathered in front of the Mars Hill Church in Southeast Portland to \"protest the church\\'s stance on homosexuality.\" Approximately 20 protesters, \"some of whom wore kerchiefs to cover their faces, shouted profanities at adults and children,\" and briefly blocked the entrance of the church. Mars Hill Church Portland lead pastor Tim Smith expressed disagreement with the conduct of the protesters, but expressed defense of their right to free speech.\\n\\nIn 2008, the church launched an online community-building network, called The City, to improve communication on all levels in the church. The City was purchased by the Christian publishing brand, Zondervan, before Christmas 2008.\\n\\nGrowth and influence \\n\\nIn 2013, The Church Guide released a list of the \"Top Churches to Watch in America\". The link ranked churches according to how much churches could learn from the ranked churches on particular topics. They ranked Mars Hill Church as #3 to learn from about church growth, #3 for innovation, #2 for church planting, and #4 overall. The list considered data from Outreach magazine\\'s annual lists from 2004–2012 and other sources.\\n\\nIn 2006, Mars Hill Church claimed $31,110,000 in assets.\\n\\nActs 29 Church Planting Network \\n\\nActs 29 Church Planting Network is a separate 501(c)(3) from Mars Hill Church but was founded by Mars Hill in 2001. It is an interdenominational network of pastors and churches from around the world whose focus is to assess and equip qualified leaders, plant new churches, and rejuvenate declining churches. The current president of Acts 29 is Matt Chandler. The offices and leadership of Acts 29 moved from Mars Hill Church in Seattle to The Village Church in Texas in March 2012.\\nIn August 2014, Acts 29 removed Mark Driscoll and Mars Hill Church from the network.\\n\\nChurch leadership controversies\\n\\nDealing with dissent \\n\\nAs a result of the large growth of the church, its bylaws were rewritten more than once. The outcome of this process led to changes in leadership organization in November 2007. The new bylaws installed lead pastor Jamie Munson, preaching pastor Mark Driscoll, and pastors Scott Thomas and Tim Beltz as \"executive pastors\" who led the objectives of the church \"under the authority of the Board of Directors,\" on which the executive pastors also served as directors. 
This change precipitated the firing of two pastors.\\n\\nMars Hill leaders said in forum postings that one fired pastor was removed, in part, for \"displaying an unhealthy distrust in the senior leadership.\" They said the other was removed for \"disregarding the accepted elder protocol for the bylaw deliberation period\" and \"verbally attacking the lead pastor\"\\xa0— charges the fired pastor denied, the leaders added.\\n\\nChurch leadership instructed members of the congregation to shun the two former elders as unrepentant. Former Mars Hill Church elders and members have criticized the church for its harshness in dealing with dissent within its leadership. Additionally, members who have openly questioned or dissented with Mars Hill leaders have been asked to leave the church. This policy of church discipline was discussed during a lecture given on April 20, 2009 by Mark Driscoll for The Gospel Coalition.\\n\\nIn early 2012, the church once again became a source of controversy over shunning and disciplinary proceedings when a young man under discipline released documents from his disciplinary contract to blogger and author Mathew Paul Turner. The documents included a discipline contract and an email from church leaders to the congregation directing them to shun him.\\n\\nResultSource contract for the Real Marriage Book \\nOn March 5, 2014, evangelical magazine World published an article claiming that Mars Hill Church paid a $25,000 fee to marketing firm ResultSource, to manipulate sales numbers of Mark Driscoll\\'s book Real Marriage and thereby attain a place on the New York Times bestseller list. ResultSource accomplished this objective—the book briefly reached #1 in the \"Advice How-to\" category—by buying 11,000 copies of the book, using $210,000 of Mars Hill Church\\'s money, from a variety of online sources and payment methods.\\n\\nThe Evangelical Council for Financial Accountability stated that buying a place on bestseller lists violates its ethical standards, but that because this happened before Mars Hill Church joined they were unable to take action. Christianity Today described the arrangement as \"ethically questionable\", and Carl Trueman of religion journal First Things decried the revelation, writing, \"the overall picture is one of disaster\" and \"[it] has raised questions not simply about personal integrity but also the very culture of American Evangelicalism.\"\\n\\nDriscoll had used the apparent success of Real Marriage to negotiate a multi-book deal with Christian publisher Tyndale House. The first book under Driscoll\\'s \"Resurgence\" imprint was A Call to Resurgence, with plans to publish five to seven books per year. 
Tyndale House defended Driscoll\\'s alleged plagiarism in A Call to Resurgence, and affirmed their continuing relationship with Driscoll.\\n\\nMars Hill Church responded with a statement, writing, \"while not uncommon or illegal, this unwise strategy is not one we had used before or since, and not one we will use again.\" Mars Hill also claimed that the \"true cost\" of the effort was less than \"what has been reported.\"\\n\\nOn March 17, 2014, Driscoll posted an open letter of apology in response to this controversy and others, writing that he will no longer claim to be a New York Times bestselling author, and that he now sees the ResultSource marketing campaign as \"manipulating a book sales reporting system, which is wrong.\" He wrote that he was giving up his status as a \"celebrity pastor\", that he considered his \"angry young prophet\" days to be over, and that he was reducing his public presence in speaking engagements and on social media.\\n\\nOn March 28, 2015, Sutton Turner, a former elder of the church who signed the Result Source contract, explained that he disapproved of the marketing plan to use Result Source, but the decision to use it had already been made before he began work at Mars Hill, so he signed the contract anyway. Turner revealed that Driscoll had not been involved in initiating nor signing the contract with Result Source. Turner stated that the business relationship with the marketing firm was initiated by a pastor who resigned shortly thereafter, and remaining church leaders disagreed over the completion of the contract, stating that it would reflect badly on the church and Mark Driscoll.\\n\\nPlagiarism allegations \\nOn November 21, 2013, radio host Janet Mefferd accused Driscoll of plagiarism. Mefferd claimed that 14 pages of Driscoll\\'s book A Call to Resurgence quoted \"extensively and without citation\" from Peter Jones\\' 1999 book, Gospel Truth/Pagan Lies: Can You Tell the Difference? and Jones\\' 2010 book One or Two: Seeing a World of Difference. Driscoll\\'s publisher Tyndale House stated that they performed a \"thorough in-house review\" and disagreed that this was a case of plagiarism. Neil Holdway, a plagiarism expert with the American Copy Editors Society, concluded that \"Driscoll had not adequately indicated the extent to which he had borrowed Jones\\' work.\"\\n\\nMore allegations of plagiarism in other Driscoll works soon surfaced, including passages from a sermon series companion text, Trial: 8 Witnesses From 1&2 Peter, which were copied verbatim from passages written by David Wheaton in the New Bible Commentary. InterVarsity Press, publisher of the New Bible Commentary, stated that Driscoll failed to properly provide quotation or attribution for the material. The relevant passages were posted online. The allegations soon expanded to include claims that Driscoll used ghostwriters and researchers without giving them proper attribution. As of December 2013, neither Peter Jones, D.A. Carson, nor Janet Mefferd had made any further statements pertaining the case.\\n\\nSyndicator Salem Radio subsequently removed both the broadcast interview with Driscoll and associated materials from Mefferd\\'s program website and apologized for raising the matter in a broadcast interview. This attempt to shut down the story provoked the resignation of Mefferd\\'s producer, Ingrid Schlueter. 
In explaining her resignation, Schlueter wrote the following regarding herself and Mefferd:\\n\\nDriscoll apologized for \"mistakes\" related to the allegations in a statement released to The Christian Post on December 18, 2013. Mefferd eventually left Salem Radio in April 2015.\\n\\nMars Hill Global Fund \\nIn June 2014 an online petition asked Sutton Turner of Mars Hill Church and Dan Busby of the Evangelical Council for Financial Accountability where the money raised through Mars Hill Global Fund actually went. The church reported that \"Mars Hill Church began to use the term \\'Global Fund\\' to solicit gifts restricted for \\'capital development and expansion\\'. As communicated in the Global Newsletter on July 7, 2009, the Global Fund was used to raise resources for the following purposes: \\'start new Mars Hill campuses, plant new Acts 29 churches, and equip leaders at the Resurgence Training Center\\'. In the 2009-2011 time frame, over 80% of the funds given to the \"Global Fund\" went to Acts 29 church planting, with additional funds used for the Resurgence Training Center and church planting in India.\" Additionally, \"subsequent to June 1, 2012, in early July 2014, Mars Hill Church sent approximately 6,000 letters and 3,765 emails to individuals who had made gifts as a global donor subsequent to June 1, 2012. In these communications, Mars Hill Church offered to redirect the donor\\'s gifts, made as a global donor during this time period, specifically for planting churches in Ethiopia or India.\"\\n\\nFormer leaders and members protest Mark Driscoll (2014) \\nMichael Paulson, writing for The New York Times, wrote that while Driscoll had endured criticism from the American political left and liberal Christianity for many years, recent years leading up to and including 2014 saw the rise of criticism from conservative Christians, including Driscoll\\'s former \"allies and supporters.\" According to the Seattle Times, plagiarism accusations against Driscoll made by Janet Mefferd were a \"crucial turning point\" that drew outside interest into Mars Hill\\'s internal affairs, and prompted inquiries from new critics about the church and how it handled its finances. After hearing of Mefferd\\'s plagiarism accusations, evangelical Christian and Grove City College psychology professor Warren Throckmorton took interest and became a prominent critic of Driscoll and Mars Hill, documenting other examples of perceived plagiarism, abuse reported by former Mars Hill members, and questionable uses of church finances.\\n\\n\"Repentant Pastors\" \\nOn March 29, 2014, four former Mars Hill elders (Kyle Firstenberg, Dave Kraft, Scott Mitchell, and co-founder Lief Moi) created a blog titled \"Repentant Pastor\" and posted online \"confessions and apologies\" related to their leadership roles in Mars Hill. In a joint statement, they wrote, \"we recognize and confess that Mars Hill has hurt many people within the Mars Hill community, as well as those outside the community.\" Salon summarized the statements, writing that the former leaders emphasized their failures to \"rein Driscoll in\" and their complicity with Driscoll\\'s \"autocratic\" management style. Firstenberg wrote that while the church appeared to flourish, employees lived in constant stress, and \"success was to be attained regardless of human and moral cost.\"\\n\\nMegachurch pastors come to Driscoll\\'s defense \\nSeveral prominent pastors publicly defended Driscoll from allegations made against him. 
Those pastors included mega-church pastor Rick Warren, author of The Purpose Driven Life, and Gateway Church\\'s founding pastor Robert Morris. At the 2014 Gateway Conference, Morris told the audience that he counseled Mark Driscoll directly, and that media reports were largely untrue. Morris cited recent media reports of lead pastor Steven Furtick of Elevation Church as experiencing similar coverage. At the conference, Mark Driscoll was invited up to the stage where he told the audience that he received death threats and that his children allegedly had rocks thrown at them. Driscoll stated that \"I\\'m just trying to figure out how to be a good pastor to my family first.\"\\n\\nDriscoll addresses former members\\' complaints \\nIn a recorded message shown to church members on July 27, 2014, Driscoll discussed the various controversies of 2014. He said that he could \"not address some members\\' discontent ... because the complaints were anonymous.\" According to Rob Smith, former program director at the church, the anonymity assertion \"really touched a nerve\" with former members. In response, dissenters organized a Facebook group called \"Dear Pastor Mark & Mars Hill: We Are Not Anonymous.\"\\n\\nThe following Sunday, \"dozens of demonstrators\" organized and picketed the Mars Hill Church Bellevue campus (where Driscoll preached live), calling for Driscoll\\'s resignation. Demonstrators carried placards reading \"We Are Not Anonymous\" and \"Question Mark\", and accused Driscoll of bullying, misogyny, inadequate transparency in church finances, and harsh discipline of members. Driscoll was away for his annual summer vacation. A church elder, Anthony Iannicielo, responded that the criticism of Driscoll and Mars Hill \"goes with the territory\" of running a large church with a long history. In a pre-recorded message, Driscoll said that he had been deliberately \"rather silent\" during the criticism, that he found it \"a little overwhelming and a bit confusing\", and that he had no intention of resigning.\\n\\nRemoval from Acts 29 Network \\nOn August 8, 2014, the board of Acts 29 Network removed both Driscoll and Mars Hill Church from membership. Chairman Matt Chandler wrote, \"it is our conviction that the nature of the accusations against Mark, most of which have been confirmed by him, make it untenable and unhelpful to keep Mark [Driscoll] and Mars Hill [Church] in our network.\"\\xa0The board of directors of Acts 29 expressed gratitude for Driscoll\\'s work with the Network as co-founder and former President, but declared his recent actions \"ungodly and disqualifying behavior.\" To Driscoll, they wrote, \"our board and network have been the recipients of ... dozens of fires directly linked to you ... we are naturally associated with you and feel that this association discredits the network and is a major distraction.\" They further advised him to \"step down from ministry for an extended time and seek help.\"\\n\\nActs 29 had attempted to \"lean on\" the Mars Hill\\'s Board of Advisors and Accountability (BOAA) to discipline Driscoll, but lost confidence in the board. The BOAA had been set up by Driscoll as his accountability board, rather than the elders of the church. (Members of the BOAA were for the most part professional clergy and businessmen who were not members of the church and hand picked by Driscoll.) The previous month, evangelical leaders and Acts 29 associates Paul Tripp and James MacDonald resigned from the BOAA. 
Religion correspondent Sarah Pulliam Bailey described Acts 29\\'s decision as \"unusual\" since \"ministries usually leave matters of church discipline up to local churches.\"\\n\\nBOAA Chairman Michael Van Skaik responded, \"Men, I told the lead pastors ... that we are making real progress in addressing the serious reconciliation and unhealthy culture issues that have been part of Mars Hill Church for way too long. And we are. ... \" He further added that Acts 29 leaders did not contact Mars Hill before acting, and that Driscoll had \"changed his ways\", and described Acts 29\\'s actions as \"divisive.\" Van Skaik also addressed the formal charges brought against Driscoll under the Mars Hill bylaws, writing \"the formal charges that were filed were serious, were taken seriously, and were not dismissed by the board lightly.\"\\n\\nDriscoll\\'s hiatus from ministry \\nOn August 24, 2014, Driscoll announced he would take a six-week \"extended focus break\" from his pastorship while charges against him were investigated. Later that week, a letter signed by nine current Mars Hill pastors which severely criticized Driscoll was leaked to the public. The letter, written days before Driscoll stepped down, urged him to step down from all aspects of ministry. It included a quote from \"internationally recognized\" author, pastor and former BOAA member Paul Tripp saying, \"This is without a doubt, the most abusive, coercive ministry culture I\\'ve ever been involved with.\" One of the pastors who signed the letter was fired five days later for \"rebellion against the church.\" By September 9, eight of the nine pastors who signed the letter had resigned or been terminated, including worship director Dustin Kensrue. The last of the nine pastors was demoted from pastor to lay elder.\\n\\nStaff layoffs and closure of church branches \\n\\nOn September 7, 2014 (the second week of Driscoll\\'s hiatus), Mars Hill officials, citing \"financial pressures in the wake of recent negative media attention\", announced layoffs and closures of a few church branches. Weekly attendance at the start of the year for all branches was 12,000–13,000, but had dropped to 8,000–9,000. Donations also had a \"steep decline.\" In response, the church planned to lay off \"30 to 40 percent\" of their 100 paid staff members, and close their downtown Seattle branch and University District branch, consolidating both congregations into the Ballard location. Two other branches outside Washington state were marked for possible closure if their finances did not improve. Mars Hill also announced the resignation of Sutton Turner, executive elder since 2011, effective at the end of September 2014.\\n\\nDriscoll\\'s resignation \\nIn the fall of 2014, a group of elders released a report on an investigation into accusations of bullying and intimidating behavior by Driscoll made by 21 former church elders. The investigation involved \"some 1,000 hours of research, interviewing more than 50 people and preparing 200 pages of information.\" The report concluded that Driscoll had never been charged with \"immorality, illegality or heresy,\" and considered \"some of the accusations against Pastor Mark to be altogether unfair or untrue.\" Additionally, the report found that many of the \"other charges had previously been addressed by Pastor Mark, privately and publicly. 
Indeed, he had publicly confessed and apologized for a number of the charges against him, some of which occurred as long as 14 years ago.\" However, elders did find \"bullying\" and \"patterns of persistent sinful behavior\" by Driscoll. The Board also concluded that Driscoll had \"been guilty of arrogance, responding to conflict with a quick temper and harsh speech, and leading the staff and elders in a domineering manner\", but was not charged with anything immoral or illegal. Driscoll maintained that he had not disqualified himself from ministry.\\n\\nChurch leadership crafted a \"restoration\" plan to help Driscoll and save the church. Instead, Driscoll declined the restoration plan and resigned on October 14, 2014, citing concerns for his health and safety. His resignation came as a \"surprise\" to the church\\'s Board of Overseers, who said in a statement that they had not asked Driscoll for his resignation.\\n\\nIn 2015, after the disbanding of Mars Hill, an executive elder of the church stated that \"There has been much talk about the abusive and coercive culture at Mars Hill. What many people do not realize is that some of the very people who were calling for an end to this type of abuse were using abusive tactics.\" The executive elder stated that he was blackmailed by a staff who asked for more severance pay. He also stated that \"former Mars Hill elders were working to file formal charges against me also. I was told that a former lead pastor was approached to lead a group of people who hoped to force my resignation so that I \\'could not help Pastor Mark Driscoll\\'.\"\\n\\nPastor and theologian John Piper referred to the controversies and subsequent church closure as a \"Satanic victory.\"\\n\\nIt was a defeat for the gospel, it was a defeat for Mark [Driscoll], it was a defeat for evangelicalism, for Reformed Theology, for complementarianism ... It was a colossal Satanic victory.Driscoll\\'s resignation is thoroughly investigated in the podcast The Rise and Fall of Mars Hill.\\n\\nClosing \\nOn October 31, 2014, lead pastor Dave Bruskas announced plans to dissolve the church\\'s 13 remaining campuses into autonomous entities, with the option of continuing, merging with other congregations, or disbanding, effective January 1, 2015.\\n\\nOn December 28, 2014, Rick Warren gave the final Sunday sermon at Mars Hill, encouraging its remaining members to \"give grace\" to its leaders, \"You need to be grateful for all the ways that God used Mars Hill Church. Be grateful for all the ways God used Mark Driscoll.\" Driscoll had previously delivered a sermon at Saddleback Church the weekend Rick Warren grieved the loss of his son.\\n\\nThe Mars Hill Church network officially disbanded Thursday, January 1, 2015. Eleven of the Mars Hill Churches became independent churches and the remaining churches were dissolved. Prior to the churches disbanding, Mars Hill transferred the majority of its content from its website to where the church\\'s sermons remain. The Mars Hill website now contains a history of the church and a church directory of the previous Mars Hill churches locations with their new names and websites.\\n\\nPrior to disbanding on January 1, 2015, Mars Hill Church met at twelve locations, mostly in Seattle and Washington state, with three out of state locations in New Mexico, California, and Oregon. A few locations were closed or consolidated on October 12, 2014. After January 1, 2015, each church location dissolved into an independent congregation. 
The remaining members of Mars Hill Ballard reorganized as Cross and Crown Church Seattle, led by former Mars Hill Downtown pastor Matthias Haeusel at Mars Hill\\'s former Ballard location.\\n\\nIn February 2016, a federal racketeering lawsuit was filed by former Mars Hill members against both Mars Hill and Driscoll. That lawsuit was dismissed in November 2016 after the plaintiffs said they did not have the money to continue the suit. The plaintiffs\\' online fundraising campaign on GoFundMe had raised $34,660, which was approximately half of its goal.\\n\\nReferences\\n\\nFurther reading\\n\\n Pastor Dude\\'s Mega-Church Draws Crowds - ABC Nightline story about Mars Hill Church\\n Tempers Flare at Debate on the Devil - ABC Nightline debate at Mars Hill Church on the Devil\\n\\nExternal links\\n \\n\\n \\nEmerging church movement\\nEvangelical churches in Washington (state)\\nFormer megachurches\\nChurches in Seattle\\nChristian organizations established in 1996\\nReligious organizations disestablished in 2015'}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corpus[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we process the ids and text of queries and corpus for preparation of embedding and searching." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "corpus_ids = corpus['docid']\n", + "corpus_text = corpus['text']\n", + "\n", + "queries_ids = dataset['dev']['query_id']\n", + "queries_text = dataset['dev']['query']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Evaluate from scratch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the demo we use bge-base-en-v1.5, feel free to change to the model you prefer." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 60.08it/s]\n", + "pre tokenize: 100%|██████████| 782/782 [02:22<00:00, 5.50it/s]\n", + "Inference Embeddings: 100%|██████████| 782/782 [02:47<00:00, 4.66it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape of the embeddings: (200000, 768)\n", + "data type of the embeddings: float16\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# get the BGE embedding model\n", + "model = FlagModel('BAAI/bge-base-en-v1.5',)\n", + " # query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\")\n", + "\n", + "# get the embedding of the queries and corpus\n", + "queries_embeddings = model.encode_queries(queries_text)\n", + "corpus_embeddings = model.encode_corpus(corpus_text)\n", + "\n", + "print(\"shape of the embeddings:\", corpus_embeddings.shape)\n", + "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a Faiss index to store the embeddings." 
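+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This tutorial uses an exact (flat) inner-product index, which is ideal at this scale (200,000 vectors). For much larger corpora, an approximate index such as HNSW can trade a little recall for speed. The cell below is an optional, minimal sketch of that alternative, assuming the `corpus_embeddings` from above; the name `hnsw_index` and the `HNSW32` setting are illustrative choices and are not used in the rest of this tutorial:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import faiss\n",
+ "import numpy as np\n",
+ "\n",
+ "# illustrative sketch: an approximate HNSW index (32 neighbors per node),\n",
+ "# using inner product to stay consistent with the flat index below\n",
+ "emb32 = corpus_embeddings.astype(np.float32)\n",
+ "hnsw_index = faiss.index_factory(emb32.shape[-1], 'HNSW32', faiss.METRIC_INNER_PRODUCT)\n",
+ "# HNSW needs no training step; adding vectors builds the graph\n",
+ "hnsw_index.add(emb32)"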
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total number of vectors: 200000\n" + ] + } + ], + "source": [ + "import faiss\n", + "import numpy as np\n", + "\n", + "# get the length of our embedding vectors, vectors by bge-base-en-v1.5 have length 768\n", + "dim = corpus_embeddings.shape[-1]\n", + "\n", + "# create the faiss index and store the corpus embeddings into the vector space\n", + "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n", + "corpus_embeddings = corpus_embeddings.astype(np.float32)\n", + "# train and add the embeddings to the index\n", + "index.train(corpus_embeddings)\n", + "index.add(corpus_embeddings)\n", + "\n", + "print(f\"total number of vectors: {index.ntotal}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Searching" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the Faiss index to search answers for each query." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Searching: 100%|██████████| 7/7 [00:01<00:00, 5.15it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "query_size = len(queries_embeddings)\n", + "\n", + "all_scores = []\n", + "all_indices = []\n", + "\n", + "for i in tqdm(range(0, query_size, 32), desc=\"Searching\"):\n", + " j = min(i + 32, query_size)\n", + " query_embedding = queries_embeddings[i: j]\n", + " score, indice = index.search(query_embedding.astype(np.float32), k=100)\n", + " all_scores.append(score)\n", + " all_indices.append(indice)\n", + "\n", + "all_scores = np.concatenate(all_scores, axis=0)\n", + "all_indices = np.concatenate(all_indices, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "results = {}\n", + "for idx, (scores, indices) in enumerate(zip(all_scores, all_indices)):\n", + " results[queries_ids[idx]] = {}\n", + " for score, index in zip(scores, indices):\n", + " if index != -1:\n", + " results[queries_ids[idx]][corpus_ids[index]] = float(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Evaluating" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Process the qrels into a dictionary with qid-docid pairs." 
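+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Each dev example lists its `positive_passages` and `negative_passages`. As a quick illustrative sanity check before building the full qrels dictionary below, count the labeled passages of the first example (the exact counts will depend on the dataset split):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative: count the labeled passages for the first dev example\n",
+ "ex = dataset['dev'][0]\n",
+ "print(ex['query_id'], len(ex['positive_passages']), len(ex['negative_passages']))"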
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "qrels_dict = {}\n",
+ "for data in dataset['dev']:\n",
+ "    qid = str(data[\"query_id\"])\n",
+ "    if qid not in qrels_dict:\n",
+ "        qrels_dict[qid] = {}\n",
+ "    for doc in data[\"positive_passages\"]:\n",
+ "        docid = str(doc[\"docid\"])\n",
+ "        qrels_dict[qid][docid] = 1\n",
+ "    for doc in data[\"negative_passages\"]:\n",
+ "        docid = str(doc[\"docid\"])\n",
+ "        qrels_dict[qid][docid] = 0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, use the [pytrec_eval](https://github.com/cvangysel/pytrec_eval) library to calculate the scores for the selected metrics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "defaultdict(<class 'list'>, {'NDCG@10': 0.35304, 'NDCG@100': 0.38694})\n",
+ "defaultdict(<class 'list'>, {'Recall@10': 0.465, 'Recall@100': 0.625})\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pytrec_eval\n",
+ "from collections import defaultdict\n",
+ "\n",
+ "ndcg_string = \"ndcg_cut.\" + \",\".join([str(k) for k in [10,100]])\n",
+ "recall_string = \"recall.\" + \",\".join([str(k) for k in [10,100]])\n",
+ "\n",
+ "evaluator = pytrec_eval.RelevanceEvaluator(\n",
+ "    qrels_dict, {ndcg_string, recall_string}\n",
+ ")\n",
+ "scores = evaluator.evaluate(results)\n",
+ "\n",
+ "all_ndcgs, all_recalls = defaultdict(list), defaultdict(list)\n",
+ "for query_id in scores.keys():\n",
+ "    for k in [10,100]:\n",
+ "        all_ndcgs[f\"NDCG@{k}\"].append(scores[query_id][\"ndcg_cut_\" + str(k)])\n",
+ "        all_recalls[f\"Recall@{k}\"].append(scores[query_id][\"recall_\" + str(k)])\n",
+ "\n",
+ "ndcg, recall = (\n",
+ "    all_ndcgs.copy(),\n",
+ "    all_recalls.copy(),\n",
+ ")\n",
+ "\n",
+ "for k in [10,100]:\n",
+ "    ndcg[f\"NDCG@{k}\"] = round(sum(ndcg[f\"NDCG@{k}\"]) / len(scores), 5)\n",
+ "    recall[f\"Recall@{k}\"] = round(sum(recall[f\"Recall@{k}\"]) / len(scores), 5)\n",
+ "\n",
+ "print(ndcg)\n",
+ "print(recall)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Evaluate using FlagEmbedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We provide independent evaluation for popular datasets and benchmarks. Try the following code to run the evaluation, or run the shell script provided in the [example](../../examples/evaluation/mldr/eval_mldr.sh) folder."
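+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next cells simulate a command-line run inside the notebook: an argument string is assigned to `sys.argv`, whose first token (the lone `-`) only stands in for the program name, and `HfArgumentParser` then consumes the flags after it. A tiny illustrative demo of the mechanism (the flags shown here are just examples):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "# illustrative: argument parsers read sys.argv[1:], so the leading '-'\n",
+ "# is only a placeholder for the program name\n",
+ "demo_argv = '- --eval_name mldr --splits dev'.split()\n",
+ "print(demo_argv[1:])"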
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "arguments = \"\"\"- \\\n", + " --eval_name mldr \\\n", + " --dataset_dir ./mldr/data \\\n", + " --dataset_names en \\\n", + " --splits dev \\\n", + " --corpus_embd_save_dir ./mldr/corpus_embd \\\n", + " --output_dir ./mldr/search_results \\\n", + " --search_top_k 1000 \\\n", + " --cache_path ./cache/data \\\n", + " --overwrite False \\\n", + " --k_values 10 100 \\\n", + " --eval_output_method markdown \\\n", + " --eval_output_path ./mldr/mldr_eval_results.md \\\n", + " --eval_metrics ndcg_at_10 \\\n", + " --embedder_name_or_path BAAI/bge-base-en-v1.5 \\\n", + " --devices cuda:0 cuda:1 \\\n", + " --embedder_batch_size 1024\n", + "\"\"\".replace('\\n','')\n", + "\n", + "sys.argv = arguments.split()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/anaconda3/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "initial target device: 100%|██████████| 2/2 [00:07<00:00, 3.54s/it]\n", + "pre tokenize: 100%|██████████| 98/98 [01:01<00:00, 1.58it/s]\n", + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "pre tokenize: 100%|██████████| 98/98 [01:07<00:00, 1.44it/s]09it/s]\n", + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "Inference Embeddings: 100%|██████████| 98/98 [01:22<00:00, 1.19it/s]\n", + "Inference Embeddings: 100%|██████████| 98/98 [01:23<00:00, 1.17it/s]\n", + "Chunks: 100%|██████████| 2/2 [02:40<00:00, 80.21s/it] \n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2.16it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2.21it/s]\n", + "Chunks: 100%|██████████| 2/2 [00:01<00:00, 1.13it/s]\n", + "Searching: 100%|██████████| 7/7 [00:01<00:00, 6.79it/s]\n", + "Qrels not found in ./mldr/data/en/dev_qrels.jsonl. 
Trying to download the qrels from the remote and save it to ./mldr/data/en.\n", + "Loading and Saving qrels: 100%|██████████| 200/200 [00:00<00:00, 598.03it/s]\n" + ] + } + ], + "source": [ + "from transformers import HfArgumentParser\n", + "\n", + "from FlagEmbedding.evaluation.mldr import (\n", + " MLDREvalArgs, MLDREvalModelArgs,\n", + " MLDREvalRunner\n", + ")\n", + "\n", + "\n", + "parser = HfArgumentParser((\n", + " MLDREvalArgs,\n", + " MLDREvalModelArgs\n", + "))\n", + "\n", + "eval_args, model_args = parser.parse_args_into_dataclasses()\n", + "eval_args: MLDREvalArgs\n", + "model_args: MLDREvalModelArgs\n", + "\n", + "runner = MLDREvalRunner(\n", + " eval_args=eval_args,\n", + " model_args=model_args\n", + ")\n", + "\n", + "runner.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"en-dev\": {\n", + " \"ndcg_at_10\": 0.35304,\n", + " \"ndcg_at_100\": 0.38694,\n", + " \"map_at_10\": 0.31783,\n", + " \"map_at_100\": 0.32469,\n", + " \"recall_at_10\": 0.465,\n", + " \"recall_at_100\": 0.625,\n", + " \"precision_at_10\": 0.0465,\n", + " \"precision_at_100\": 0.00625,\n", + " \"mrr_at_10\": 0.31783,\n", + " \"mrr_at_100\": 0.32469\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "with open('mldr/search_results/bge-base-en-v1.5/NoReranker/EVAL/eval_results.json', 'r') as content_file:\n", + " print(content_file.read())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/5_Reranking.rst b/docs/source/tutorial/5_Reranking.rst index 9f9a8cbc..a14e1ab8 100644 --- a/docs/source/tutorial/5_Reranking.rst +++ b/docs/source/tutorial/5_Reranking.rst @@ -6,4 +6,6 @@ :maxdepth: 1 :caption: Reranking - 5_Reranking/5.1 \ No newline at end of file + 5_Reranking/5.1 + 5_Reranking/5.2 + 5_Reranking/5.3 \ No newline at end of file diff --git a/docs/source/tutorial/5_Reranking/5.2.ipynb b/docs/source/tutorial/5_Reranking/5.2.ipynb new file mode 100644 index 00000000..7a41d8f3 --- /dev/null +++ b/docs/source/tutorial/5_Reranking/5.2.ipynb @@ -0,0 +1,380 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BGE Reranker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like embedding models, BGE has a group of rerankers with various sizes and functionalities. In this tutorial, we will introduce the BGE rerankers series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the dependencies in the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U FlagEmbedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
bge-reranker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first generation of BGE reranker contains two models:\n", + "\n", + "| Model | Language | Parameters | Description | Base Model |\n", + "|:-------|:--------:|:----:|:-----------------:|:--------------------------------------:|\n", + "| [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) | Chinese and English | 278M | a cross-encoder model which is more accurate but less efficient | XLM-RoBERTa-Base |\n", + "| [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | Chinese and English | 560M | a cross-encoder model which is more accurate but less efficient | XLM-RoBERTa-Large |" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "data": { + "text/plain": [ + "[7.984375, -6.84375, -7.15234375, 5.44921875]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from FlagEmbedding import FlagReranker\n", + "\n", + "model = FlagReranker(\n", + " 'BAAI/bge-reranker-large',\n", + " use_fp16=True,\n", + " devices=[\"cuda:0\"], # if you don't have GPUs, you can use \"cpu\"\n", + ")\n", + "\n", + "pairs = [\n", + " [\"What is the capital of France?\", \"Paris is the capital of France.\"],\n", + " [\"What is the capital of France?\", \"The population of China is over 1.4 billion people.\"],\n", + " [\"What is the population of China?\", \"Paris is the capital of France.\"],\n", + " [\"What is the population of China?\", \"The population of China is over 1.4 billion people.\"]\n", + "]\n", + "\n", + "scores = model.compute_score(pairs)\n", + "scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. bge-reranker v2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Description | Base Model |\n", + "|:-------|:--------:|:----:|:-----------------:|:--------------------------------------:|\n", + "| [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | Multilingual | 568M | a lightweight cross-encoder model, possesses strong multilingual capabilities, easy to deploy, with fast inference. | XLM-RoBERTa-Large |\n", + "| [BAAI/bge-reranker-v2-gemma](https://huggingface.co/BAAI/bge-reranker-v2-gemma) | Multilingual | 2.51B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English proficiency and multilingual capabilities. | Gemma2-2B |\n", + "| [BAAI/bge-reranker-v2-minicpm-layerwise](https://huggingface.co/BAAI/bge-reranker-v2-minicpm-layerwise) | Multilingual | 2.72B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers for output, facilitating accelerated inference. 
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. bge-reranker v2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Description | Base Model |\n", + "|:-------|:--------:|:----:|:-----------------:|:--------------------------------------:|\n", + "| [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | Multilingual | 568M | a lightweight cross-encoder model, possesses strong multilingual capabilities, easy to deploy, with fast inference. | XLM-RoBERTa-Large |\n", + "| [BAAI/bge-reranker-v2-gemma](https://huggingface.co/BAAI/bge-reranker-v2-gemma) | Multilingual | 2.51B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English proficiency and multilingual capabilities. | Gemma2-2B |\n", + "| [BAAI/bge-reranker-v2-minicpm-layerwise](https://huggingface.co/BAAI/bge-reranker-v2-minicpm-layerwise) | Multilingual | 2.72B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers for output, facilitating accelerated inference. | MiniCPM |\n", + "| [BAAI/bge-reranker-v2.5-gemma2-lightweight](https://huggingface.co/BAAI/bge-reranker-v2.5-gemma2-lightweight) | Multilingual | 9.24B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers, compress ratio and compress layers for output, facilitating accelerated inference. | Gemma2-9B |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bge-reranker-v2-m3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "bge-reranker-v2-m3 is trained based on bge-m3, introducing strong multilingual capability while keeping a slim model size." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.003483424193080668]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagReranker\n", + "\n", + "# Setting use_fp16 to True speeds up computation with a slight performance degradation (if using gpu)\n", + "reranker = FlagReranker('BAAI/bge-reranker-v2-m3', devices=[\"cuda:0\"], use_fp16=True)\n", + "\n", + "score = reranker.compute_score(['query', 'passage'])\n", + "# or set \"normalize=True\" to apply a sigmoid function to the score for 0-1 range\n", + "score = reranker.compute_score(['query', 'passage'], normalize=True)\n", + "\n", + "print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bge-reranker-v2-gemma" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "bge-reranker-v2-gemma is trained based on gemma-2b. It has excellent performance in both English proficiency and multilingual capability." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 5.29it/s]\n", + "You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "100%|██████████| 1/1 [00:00<00:00, 45.99it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1.974609375]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagLLMReranker\n", + "\n", + "reranker = FlagLLMReranker('BAAI/bge-reranker-v2-gemma', devices=[\"cuda:0\"], use_fp16=True)\n", + "\n", + "score = reranker.compute_score(['query', 'passage'])\n", + "print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bge-reranker-v2-minicpm-layerwise" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "bge-reranker-v2-minicpm-layerwise is trained based on minicpm-2b-dpo-bf16. It's suitable for multilingual contexts and performs well in both English and Chinese.\n", + "\n", + "Another special feature is its layerwise design, which gives users the freedom to select the layers used for output, facilitating accelerated inference."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 3.85it/s]\n", + "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "100%|██████████| 1/1 [00:00<00:00, 24.51it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-7.06640625]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from FlagEmbedding import LayerWiseFlagLLMReranker\n", + "\n", + "reranker = LayerWiseFlagLLMReranker('BAAI/bge-reranker-v2-minicpm-layerwise', devices=[\"cuda:0\"], use_fp16=True)\n", + "\n", + "# Adjust 'cutoff_layers' to pick which layers are used for computing the score.\n", + "score = reranker.compute_score(['query', 'passage'], cutoff_layers=[28])\n", + "print(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bge-reranker-v2.5-gemma2-lightweight" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "bge-reranker-v2.5-gemma2-lightweight is trained based on gemma2-9b. It's also suitable for multilingual contexts.\n", + "\n", + "Besides the layerwise reduction functionality, bge-reranker-v2.5-gemma2-lightweight integrates token compression capabilities to further save resources while maintaining outstanding performance." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00, 3.60it/s]\n", + "You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "100%|██████████| 1/1 [00:00<00:00, 23.95it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[14.734375]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from FlagEmbedding import LightWeightFlagLLMReranker\n", + "\n", + "reranker = LightWeightFlagLLMReranker('BAAI/bge-reranker-v2.5-gemma2-lightweight', devices=[\"cuda:0\"], use_fp16=True)\n", + "\n", + "# Adjust 'cutoff_layers', 'compress_ratio' and 'compress_layers' to control which layers are used and how tokens are compressed.\n", + "score = reranker.compute_score(['query', 'passage'], cutoff_layers=[28], compress_ratio=2, compress_layers=[24, 40])\n", + "print(score)" + ] + },
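 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To see where these rerankers fit in practice, here is a minimal retrieve-then-rerank sketch: an embedding model first retrieves the top-k candidates, then a reranker rescores only those k pairs. The tiny corpus, the query, and k below are toy assumptions for illustration; any reranker above can be swapped in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from FlagEmbedding import FlagModel, FlagReranker\n", + "\n", + "# toy corpus and query, for illustration only\n", + "corpus = [\n", + "    \"Paris is the capital of France.\",\n", + "    \"The population of China is over 1.4 billion people.\",\n", + "    \"The Eiffel Tower is located in Paris.\",\n", + "]\n", + "query = \"What is the capital of France?\"\n", + "\n", + "# step 1: retrieve top-k candidates with an embedding model\n", + "embedder = FlagModel('BAAI/bge-base-en-v1.5', use_fp16=True)\n", + "q_emb = embedder.encode_queries([query])\n", + "p_embs = embedder.encode_corpus(corpus)\n", + "sims = (q_emb @ p_embs.T)[0]\n", + "k = 2\n", + "top_k_ids = sims.argsort()[::-1][:k]\n", + "\n", + "# step 2: rerank only the k retrieved pairs with a cross-encoder\n", + "reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)\n", + "rerank_scores = reranker.compute_score([[query, corpus[i]] for i in top_k_ids], normalize=True)\n", + "for i, s in sorted(zip(top_k_ids, rerank_scores), key=lambda x: -x[1]):\n", + "    print(f\"{s:.4f}  {corpus[i]}\")" + ] + },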
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparison" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The BGE reranker series provides a wide range of choices for different functionalities and scenarios. You can select a model according to your scenario and resources:\n", + "\n", + "- For multilingual, utilize `BAAI/bge-reranker-v2-m3`, `BAAI/bge-reranker-v2-gemma` and `BAAI/bge-reranker-v2.5-gemma2-lightweight`.\n", + "\n", + "- For Chinese or English, utilize `BAAI/bge-reranker-v2-m3` and `BAAI/bge-reranker-v2-minicpm-layerwise`.\n", + "\n", + "- For efficiency, utilize `BAAI/bge-reranker-v2-m3` and the low layers of `BAAI/bge-reranker-v2-minicpm-layerwise`.\n", + "\n", + "- For saving resources and extreme efficiency, utilize `BAAI/bge-reranker-base` and `BAAI/bge-reranker-large`.\n", + "\n", + "- For better performance, we recommend `BAAI/bge-reranker-v2-minicpm-layerwise` and `BAAI/bge-reranker-v2-gemma`.\n", + "\n", + "Always test on your real use case and choose the one with the best speed-quality balance!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ft", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/5_Reranking/5.3.ipynb b/docs/source/tutorial/5_Reranking/5.3.ipynb new file mode 100644 index 00000000..3d74e2cf --- /dev/null +++ b/docs/source/tutorial/5_Reranking/5.3.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate Reranker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rerankers usually capture the latent semantic meaning between sentences better than embedding models. But compared to using an embedding model, scoring a whole dataset with a reranker takes quadratic $O(N^2)$ running time. Thus the most common use case of rerankers in information retrieval or RAG is reranking the top-k answers retrieved according to the embedding similarities.\n", + "\n", + "The evaluation of rerankers follows a similar idea: we compare how much a reranker improves the ranking of the candidates retrieved by the same embedder. In this tutorial, we will evaluate two rerankers' performance on the BEIR benchmark, with bge-large-en-v1.5 as the base embedding model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: We highly recommend running this notebook with a GPU. The whole pipeline is very time-consuming. For simplicity, we only use a single BEIR task, FiQA." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, install the required dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install FlagEmbedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. bge-reranker-large" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first model is bge-reranker-large, a BERT-like reranker with about 560M parameters.\n", + "\n", + "We can use the evaluation pipeline of FlagEmbedding to directly run the whole process:" + ] + },
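 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell below is a minimal sketch of what such an invocation can look like, assuming the BEIR evaluation module mirrors the MLDR evaluation pipeline shown earlier; the argument names here are illustrative assumptions, so check the FlagEmbedding documentation for the exact classes and flags." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "from transformers import HfArgumentParser\n", + "\n", + "# assumption: the BEIR evaluation module mirrors FlagEmbedding.evaluation.mldr used earlier\n", + "from FlagEmbedding.evaluation.beir import (\n", + "    BEIREvalArgs, BEIREvalModelArgs,\n", + "    BEIREvalRunner\n", + ")\n", + "\n", + "# simulate command-line flags inside the notebook; these names are illustrative, not verbatim\n", + "sys.argv = [\n", + "    'eval_beir.py',\n", + "    '--dataset_names', 'fiqa',\n", + "    '--embedder_name_or_path', 'BAAI/bge-large-en-v1.5',\n", + "    '--reranker_name_or_path', 'BAAI/bge-reranker-large',\n", + "]\n", + "\n", + "parser = HfArgumentParser((BEIREvalArgs, BEIREvalModelArgs))\n", + "eval_args, model_args = parser.parse_args_into_dataclasses()\n", + "\n", + "runner = BEIREvalRunner(eval_args=eval_args, model_args=model_args)\n", + "runner.run()" + ] + },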
 + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Split 'dev' not found in the dataset. Removing it from the list.\n", + "ignore_identical_ids is set to True. This means that the search results will not contain identical ids. Note: Dataset such as MIRACL should NOT set this to True.\n", + "pre tokenize: 100%|██████████| 57/57 [00:03<00:00, 14.68it/s]\n", + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n", + " warnings.warn(\n", + "Inference Embeddings: 100%|██████████| 57/57 [00:44<00:00, 1.28it/s]\n", + "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 61.59it/s]\n", + "Inference Embeddings: 100%|██████████| 1/1 [00:00<00:00, 6.22it/s]\n", + "Searching: 100%|██████████| 21/21 [00:00<00:00, 68.26it/s]\n", + "pre tokenize: 0%| | 0/64 [00:00<?, ?it/s]\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "include_num_input_tokens_seen=False,\n", + "include_tokens_per_second=False,\n", + "jit_mode_eval=False,\n", + "kd_loss_type=kl_div,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=1e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=False,\n", + "local_rank=0,\n", + "log_level=passive,\n", + "log_level_replica=warning,\n", + "log_on_each_node=True,\n", + "logging_dir=./test_encoder_only_base_bge-large-en-v1.5/runs/Dec23_06-27-30_job-40fb0ce3-8bfb-46ea-b409-0a2e2a1a3163-master-0,\n", + "logging_first_step=False,\n", + "logging_nan_inf_filter=True,\n", + "logging_steps=1.0,\n", + "logging_strategy=IntervalStrategy.STEPS,\n", + "lr_scheduler_kwargs={},\n", + "lr_scheduler_type=SchedulerType.LINEAR,\n", + "max_grad_norm=1.0,\n", + "max_steps=-1,\n", + "metric_for_best_model=None,\n", + "mp_parameters=,\n", + "neftune_noise_alpha=None,\n", + "negatives_cross_device=True,\n", + "no_cuda=False,\n", + "normalize_embeddings=True,\n", + "num_train_epochs=2.0,\n", + "optim=OptimizerNames.ADAMW_TORCH,\n", + "optim_args=None,\n", + "optim_target_modules=None,\n", + "output_dir=./test_encoder_only_base_bge-large-en-v1.5,\n", + "overwrite_output_dir=True,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=8,\n", + "per_device_train_batch_size=2,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=[],\n", + "restore_callback_states_from_checkpoint=False,\n", + "resume_from_checkpoint=None,\n", + "run_name=./test_encoder_only_base_bge-large-en-v1.5,\n", + "save_on_each_node=False,\n", + "save_only_model=False,\n", + "save_safetensors=True,\n", + "save_steps=1000,\n", + "save_strategy=IntervalStrategy.STEPS,\n", + "save_total_limit=None,\n", + "seed=42,\n", + "sentence_pooling_method=cls,\n", + "skip_memory_metrics=True,\n", + "split_batches=None,\n", + "sub_batch_size=None,\n", + "temperature=0.02,\n", + "tf32=None,\n", + "torch_compile=False,\n", + "torch_compile_backend=None,\n", + "torch_compile_mode=None,\n", + "torch_empty_cache_steps=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n",
"tpu_num_cores=None,\n", + "use_cpu=False,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + "warmup_ratio=0.1,\n", + "warmup_steps=0,\n", + "weight_decay=0.0,\n", + ")\n", + "12/23/2024 06:27:40 - INFO - FlagEmbedding.abc.finetune.embedder.AbsRunner - Model parameters AbsEmbedderModelArguments(model_name_or_path='BAAI/bge-large-en-v1.5', config_name=None, tokenizer_name=None, cache_dir='./cache/model', trust_remote_code=False, token=None)\n", + "12/23/2024 06:27:40 - INFO - FlagEmbedding.abc.finetune.embedder.AbsRunner - Data parameters AbsEmbedderDataArguments(train_data=['./ft_data/training.json'], cache_path='./cache/data', train_group_size=8, query_max_len=512, passage_max_len=512, pad_to_multiple_of=8, max_example_num_per_dataset=100000000, query_instruction_for_retrieval='Represent this sentence for searching relevant passages: ', query_instruction_format='{}{}', knowledge_distillation=False, passage_instruction_for_retrieval=None, passage_instruction_format='{}{}', shuffle_ratio=0.0, same_dataset_within_batch=False, small_threshold=0, drop_threshold=0)\n", + "12/23/2024 06:27:40 - WARNING - FlagEmbedding.abc.finetune.embedder.AbsRunner - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, 16-bits training: True\n", + "12/23/2024 06:35:01 - INFO - FlagEmbedding.finetune.embedder.encoder_only.base.runner - Config: BertConfig {\n", + " \"_name_or_path\": \"BAAI/bge-large-en-v1.5\",\n", + " \"architectures\": [\n", + " \"BertModel\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 1024,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 4096,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 16,\n", + " \"num_hidden_layers\": 24,\n", + " \"pad_token_id\": 0,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.44.2\",\n", + " \"type_vocab_size\": 2,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "12/23/2024 06:35:01 - INFO - FlagEmbedding.abc.finetune.embedder.AbsDataset - loading data from ./ft_data/training.json ...\n", + "Generating train split: 6300 examples [00:00, 46043.95 examples/s]\n", + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:24: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", + " warnings.warn(\n", + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:24: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", + " warnings.warn(\n", + "12/23/2024 06:35:02 - WARNING - accelerate.utils.other - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. 
It is recommended to upgrade the kernel to the minimum version or higher.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1734935704.354551] [job-40fb0ce3-8bfb-46ea-b409-0a2e2a1a3163-master-0:1362491:f] vfs_fuse.c:281 UCX ERROR inotify_add_watch(/tmp) failed: No space left on device\n", + "[1734935704.383634] [job-40fb0ce3-8bfb-46ea-b409-0a2e2a1a3163-master-0:1362492:f] vfs_fuse.c:281 UCX ERROR inotify_add_watch(/tmp) failed: No space left on device\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja...\n", + "/share/project/xzy/Envs/ft/lib/python3.11/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. \n", + "If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].\n", + " warnings.warn(\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading extension module fused_adam...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to load fused_adam op: 1.1966907978057861 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading extension module fused_adam...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to load fused_adam op: 1.2037739753723145 seconds\n", + "[2024-12-23 06:35:06,883] [WARNING] [lr_schedules.py:683:get_lr] Attempting to get learning rate from scheduler before it has started\n", + "[2024-12-23 06:35:06,888] [WARNING] [lr_schedules.py:683:get_lr] Attempting to get learning rate from scheduler before it has started\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You're using a BertTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + " 0%| | 0/3150 [00:00<?, ?it/s]\n", + "defaultdict(<class 'list'>, {'NDCG@10': 0.70405, 'NDCG@100': 0.73528})\n", + "defaultdict(<class 'list'>, {'MAP@10': 0.666, 'MAP@100': 0.67213})\n", + "defaultdict(<class 'list'>, {'Recall@10': 0.82286, 'Recall@100': 0.97286})\n", + "defaultdict(<class 'list'>, {'P@10': 0.08229, 'P@100': 0.00973})\n", + "defaultdict(<class 'list'>, {'MRR@10': 0.666, 'MRR@100': 0.67213})\n" + ] + } + ], + "source": [ + "raw_model = FlagModel(\n", + " raw_name, \n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " devices=[0],\n", + " use_fp16=False\n", + ")\n", + "\n", + "results = search(raw_model, queries_text, corpus_text)\n", + "\n", + "eval_res = evaluate_metrics(qrels_dict, results, k_values)\n", + "mrr = evaluate_mrr(qrels_dict, results, k_values)\n", + "\n", + "for res in eval_res:\n", + " print(res)\n", + "print(mrr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then the results for the model after fine-tuning:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "pre tokenize: 100%|██████████| 3/3 [00:00<00:00, 164.72it/s]\n", + "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "Inference Embeddings: 100%|██████████| 3/3 [00:00<00:00, 9.45it/s]\n", + "pre tokenize: 100%|██████████| 28/28 [00:00<00:00, 160.19it/s]\n", + "Inference Embeddings: 100%|██████████| 28/28 [00:04<00:00, 6.06it/s]\n", + "Searching: 100%|██████████| 22/22 [00:07<00:00, 2.80it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "defaultdict(<class 'list'>, {'NDCG@10': 0.84392, 'NDCG@100': 0.85792})\n", + "defaultdict(<class 'list'>, {'MAP@10': 0.81562, 'MAP@100': 0.81875})\n", + "defaultdict(<class 'list'>, {'Recall@10': 0.93143, 'Recall@100': 0.99429})\n", + "defaultdict(<class 'list'>, {'P@10': 0.09314, 'P@100': 0.00994})\n", + "defaultdict(<class 'list'>, {'MRR@10': 0.81562, 'MRR@100': 0.81875})\n" + ] + } + ], + "source": [ + "ft_model = FlagModel(\n", + " finetuned_path, \n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " devices=[0],\n", + " use_fp16=False\n", + ")\n", + "\n", + "results = search(ft_model, queries_text, corpus_text)\n", + "\n", + "eval_res = evaluate_metrics(qrels_dict, results, k_values)\n", + "mrr = evaluate_mrr(qrels_dict, results, k_values)\n", + "\n", + "for res in eval_res:\n", + " print(res)\n", + "print(mrr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see an obvious improvement in all the metrics." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ft", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/tutorial/index.rst b/docs/source/tutorial/index.rst index 96ebb4ec..4e0c5782 100644 --- a/docs/source/tutorial/index.rst +++ b/docs/source/tutorial/index.rst @@ -6,7 +6,6 @@ In this section, we provide hands on introduction to different topics that highl To run the tutorials, clone the GitHub repo and check the `Tutorials <https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials>`_ folder. .. toctree:: - :hidden: :maxdepth: 1 :caption: Tutorials @@ -15,4 +14,5 @@ To run the tutorials, clone the GitHub repo and check the `Tutorials