From 6c8f18bd482b46e60ae8937c8677f2efe926c258 Mon Sep 17 00:00:00 2001 From: Derrick Mwiti Date: Fri, 3 Nov 2023 12:39:51 +0300 Subject: [PATCH 1/8] add generate notebook --- notebooks/generate-text/generate.ipynb | 3849 ++++++++++++++++++++++++ 1 file changed, 3849 insertions(+) create mode 100644 notebooks/generate-text/generate.ipynb diff --git a/notebooks/generate-text/generate.ipynb b/notebooks/generate-text/generate.ipynb new file mode 100644 index 0000000..43269f9 --- /dev/null +++ b/notebooks/generate-text/generate.ipynb @@ -0,0 +1,3849 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "eOdUzzSnPeg6" + }, + "source": [ + "# How to Generate Text on CPUs Using Different Decoding Strategies for Language Models With DeepSparse\n", + "\n", + "This notebook walks through different strategies for generating text using DeepSparse on CPUs. Read the accompanying blog post on the [Neural Magic website](https://neuralmagic.com/blog/). " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-PbqWnceB5bl" }, "outputs": [], "source": [ "pip install deepsparse-nightly[llm] langchain sentence-transformers chromadb datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "referenced_widgets": [ "f3d230a744b34495b0a841599e9b4894" ] }, "id": "hk_pZ22NB5bn", "outputId": "bd169fdd-15e1-4e1d-86df-9a48057215c2" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f3d230a744b34495b0a841599e9b4894", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Fetching 10 files: 0%| | 0/10 [00:00<?, ?it/s] >WARN< operator() ./src/include/wand/utility/warnings.hpp:14] Generating emulated code for quantized (INT8) operations since no VNNI instructions were detected. 
Set NM_FAST_VNNI_EMULATION=1 to increase performance at the expense of accuracy.\n" ] } ], "source": [ "from deepsparse import TextGeneration\n", "\n", "MODEL_PATH = \"hf:neuralmagic/mpt-7b-chat-pruned50-quant\"\n", "\n", "text_pipeline = TextGeneration(model_path=MODEL_PATH, sequence_length=2048)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AJiru1lAPehA" }, "outputs": [], "source": [ "from langchain.llms import DeepSparse\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.vectorstores import Chroma\n", "from langchain.chains import RetrievalQA\n", "from langchain.document_loaders import TextLoader, DirectoryLoader" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "P-IEi3uzPehB" }, "outputs": [], "source": [ "DATA_PATH = \"docs\"\n", "\n", "loader = DirectoryLoader(DATA_PATH, glob=\"*.txt\", loader_cls=TextLoader)\n", "documents = loader.load()\n", "\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n", "\n", "texts = text_splitter.split_documents(documents)\n", "embeddings = HuggingFaceEmbeddings(\n", " model_name=\"sentence-transformers/all-MiniLM-L6-v2\", model_kwargs={\"device\": \"cpu\"}\n", ")\n", "\n", "docsearch = Chroma.from_documents(texts, embeddings)" ] }, { "cell_type": "markdown", "metadata": { "id": "9Go7vQYJPehC" }, "source": [ "## Temperature" ] }, { "cell_type": "markdown", "metadata": { "id": "jLrj1I0IB5bp" }, "source": [ "The temperature to use when sampling from the probability distribution computed from the logits. Higher values result in more random samples. The value should be greater than 0.0." ] }, { "cell_type": "markdown", "metadata": { "id": "CpE19G4uPehD" }, "source": [ "### Summarization" ] }, { "cell_type": "markdown", "metadata": { "id": "Wb0ce9yZPehD" }, "source": [ "The best summary was obtained with a temperature of 0.1." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "WCuajSScPehD", "outputId": "8e4a6b05-241b-475c-c591-e26625c05b7b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The difficulty in pruning models without accuracy loss\n", "The difficulty in handling non-differential quantization\n", "The researchers’ solution was to use distillation loss to achieve high sparsity levels.\n" ] } ], "source": [ "generation_config = {\"max_new_tokens\": 300}\n", "result = text_pipeline(\n", " prompt=\"\"\"\n", "Write a concise summary of the following:\n", "\n", "Sparse Finetuning for Inference Acceleration of Large Language Models\n", "\n", "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", "\n", "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. 
Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0KpqgoRgPehE", + "outputId": "1667da50-b036-4b42-a532-ed997857651a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning models without accuracy loss\n", + "The difficulty in handling non-differential quantizations\n", + "The researchers’ solution was to use distillation loss instead of loss-based methods. They also pruned MPT with 75% sparsity without accuracy loss, showing performance that is on par with quantization approaches\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.1, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. 
These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1MNj-HvWPehE", + "outputId": "60c131bc-930a-4af7-f10a-e965d7397957" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in handling errors introduced by pruning during training\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.2, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. 
However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ny0o3NNrPehE", + "outputId": "b2a78ced-81b9-4093-f1a2-5970ca857836" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning larger models without sacrificing accuracy or performance\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.3, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. 
In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5kWLn404PehF", + "outputId": "80039b96-02c4-4edf-e81e-29482a0a2349" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning techniques or quantizing weights while preserving performance\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.7, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RCMD8JzmPehF", + "outputId": "197715cc-3deb-476a-9b54-108840f3f7f6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning and reweighing models\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.4, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-mcSkP9qPehF", + "outputId": "95511d53-e213-42f7-be79-cc9306f8775f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To achieve state-of Capabilities with fewer parameters than GPT2\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.5, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mF2jacUNPehF", + "outputId": "b5a797b8-cfba-40c3-b3c7-d0091618273c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tackling both tasks simultaneously posed challenges\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.6, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1NGiOtHDPehF", + "outputId": "691734a6-d149-4fa5-ffb9-80f408a8bda2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Achieving higher quality gains vs lower quality settings\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.7, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FGNfAYC3PehF", + "outputId": "adf62f7c-82ba-466d-90eb-833757db07cb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "And finally achieving both higher sparsities and improvements in accuracy\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.9, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "k2h08iOlPehG", + "outputId": "0d3dec2b-f697-447d-cae3-4db8d6d4781e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The computational complexity associated with handling very dense representations when doing inference\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.8, \"do_sample\": True, \"max_new_tokens\": 300}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", "The fine-tuning data may not be as large as the training data\n", "The desire to achieve high sparsity levels\n", "\"\"\",\n", " generation_config=generation_config,\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "markdown", "metadata": { "id": "5qc7AOoAPehG" }, "source": [ "### Creative Writing" ] }, { "cell_type": "markdown", "metadata": { "id": "DCPyZjSqPehG" }, "source": [ "With the default temperature of 1.0, the model repeats the phrase `As the character`." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3QlY-OeuPehG", "outputId": "147695aa-fe88-4c1a-c6ee-4afd114c9a90" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\n", "The package arrived in the mail one morning, addressed to the character in an unfamiliar handwriting. The package was wrapped in black tape and sealed with a strange symbol etched into the lid. As the character opened the package, they found themselves staring at a strange crystal, unlike anything they’d ever seen before.\n", "The crystal was translucent, and as the character held it in their hand, they could see the faint outlines of whatever was inside. The crystal seemed to pulse and glow, and the character felt a strange sensation in their chest.\n", "As the character continued to examine the crystal, they realized that it was a portal to another dimension. The crystal was a gateway to a realm of infinite possibilities, and the character was suddenly filled with excitement and wonder.\n", "The character spent the next few days exploring the new realm, discovering new worlds and meeting new beings. The experience was exhilarating, and the character felt alive and invigorated.\n", "As the character continued to explore the new realm, they realized that the crystal was a portal to a realm of infinite possibilities. The world was full of wonder and excitement, and the character felt alive and invigor in the new realm.\n", "As the character continued to explore the new realm, they realized that the crystal was a portal to a realm of infinite possibilities. The world was full of wonder\n" ] } ], "source": [ "generation_config = {\"max_new_tokens\": 300}\n", "\n", "result = text_pipeline(\n", " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", " generation_config=generation_config,\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "markdown", "metadata": { "id": "HSfviKg3PehG" }, "source": [ "At `temperature=0.6`, the model doesn't generate a story but instead offers some ideas about how one could be written." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SSQKjV7dPehG", "outputId": "9580f2a8-133c-43fd-e74b-784c64ce0695" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "The answer to this question depends on the individual and their experiences. However, one possible answer could be that the object is a powerful artifact that grants the character immense power and abilities that they never had before. 
This would drastically change their life, granting them new opportunities and abilities to achieve their goals. On the other hand, if the object is something negative like a curse or curse-like effect, it could have disastrous consequences for the character’s life. Overall, without more information about the specific character and situation, there are many possibilities for how this object could change their life.\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.6, \"do_sample\": True, \"max_new_tokens\": 300}\n", "result = text_pipeline(\n", " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", " generation_config=generation_config,\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "markdown", "metadata": { "id": "DAfjWkdyPehG" }, "source": [ "At a temperature of 0.8, the model writes a compelling story and doesn't repeat itself." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zMCLJR2_PehG", "outputId": "33b4e293-5a2d-45cf-caae-6129bde9e0af" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "The mysterious object is an ancient talisman that has the power to grant wishes. The character spends time pondering over what they would wish for, eventually settling on the desire for world peace. They hold a ceremony where they pour a glass of milk into a bowl, symbolizing their wish for peace in our world. After this ceremony, things seem to change in the world: conflicts seem to be resolved and misunderstand at workplaces was considerably reduced due and people seemed happier and more content with themselves as if these positive changes were brought about by magic.\n", "While this seems like a happy ending, it seems that there is still much work to be done in achieving real peace on earth. The talisman only grants wishes but does not solve underlying issues causing conflict or unhappiness. However, as the character realizes and acknowledges this fact while also understanding that there are still challenges ahead of them, they feel motivated to continue working towards resolving world conflict and creating lasting peace on earth.\n", "As more time passes after this event, many positive changes have been brought about through peaceful efforts towards resolving conflict and creating lasting peace on Earth. The talisman’s power has inspired many who seek peaceful resolutions instead of resorting to violence against those who disagree with them or different ways of living life. Through peaceful efforts towards resolving conflict through negotiation and understanding, lasting peace has been achieved in significant parts of our planet due to those valuing these ideals rather than resorting to violence against others or different ways of living life val\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.8, \"do_sample\": True, \"max_new_tokens\": 300}\n", "result = text_pipeline(\n", " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. 
What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fs7W9AtSPehG" + }, + "source": [ + "### RAG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qpbvU1-TPehG", + "outputId": "44a9385e-e4d6-4ecb-ee86-a38c630d5c43" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 134432.82it/s]\n", + "2023-10-26 06:06:58 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:06:58 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:07:14 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + } + ], + "source": [ + "llm = DeepSparse(model=MODEL_PATH)\n", + "\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UYfkISmsPehG" + }, + "outputs": [], + "source": [ + "answer = res[\"result\"]\n", + "source_documents = res[\"source_documents\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6FEBO_92PehG", + "outputId": "81fe121a-fa9c-4183-f3a0-106363576d15" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "' Beschuit, pannenkoeken, and ontbijtkoeken.'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pz78u-NRPehG", + "outputId": "8262217e-1353-437e-c927-1be5afdbddf4" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 134432.82it/s]\n", + "2023-10-26 06:07:40 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. 
This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:07:40 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:07:56 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit (Dutch crisp bakes) is also eaten as a breakfast food, with the same variety of sweet topp in the Netherlands is to serve strawberries on beschuit\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.1, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jEP_YTN-PehH", + "outputId": "b20fc247-b17d-4734-b1b0-e18459787a48" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 126716.13it/s]\n", + "2023-10-26 06:08:23 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. 
This improves performance, but may result in additional memory consumption.\n", "2023-10-26 06:08:23 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", "2023-10-26 06:08:35 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Beschuit; pancake; toast; French toast; scrambled eggs; yogurt; fruit; muesli\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.8, \"do_sample\": True, \"max_new_tokens\": 500}\n", "model_config = {\"sequence_length\": 2048}\n", "llm = DeepSparse(\n", " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", ")\n", "chain = RetrievalQA.from_chain_type(\n", " llm,\n", " chain_type=\"stuff\",\n", " return_source_documents=True,\n", " retriever=docsearch.as_retriever(),\n", ")\n", "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", "answer = res[\"result\"]\n", "print(answer)" ] }, { "cell_type": "markdown", "metadata": { "id": "c7cPFf9xPehH" }, "source": [ "### Language Translation" ] }, { "cell_type": "markdown", "metadata": { "id": "wxpuMHT2PehN" }, "source": [ "`\"temperature\": 0.5` gives a good translation, but it renders wild animals as `rescued animals` and there is repetition." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Tg2usBAJPehN", "outputId": "265ef19f-47bf-4496-e27d-77ea5e5ce8a4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "`Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\n", "\n", "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", "\n", "`It is good to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.`\n", "\n", "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", "\n", "`Today is a good day to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.5, \"do_sample\": True, \"max_new_tokens\": 500}\n", "result = text_pipeline(\n", " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. 
After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", " generation_config=generation_config,\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "markdown", "metadata": { "id": "cQOj5tQQPehN" }, "source": [ "`\"temperature\": 0.1` gives the same output as 0.5, with repetition.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ky2nMbzkPehN", "outputId": "5e23a43a-85e8-4bc4-a30b-368e5e67d976" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "`Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\n", "\n", "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", "\n", "`It is good to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.`\n", "\n", "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", "\n", "`Today is a good day to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.1, \"do_sample\": True, \"max_new_tokens\": 500}\n", "result = text_pipeline(\n", " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", " generation_config=generation_config,\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "markdown", "metadata": { "id": "sYYf7vJTPehN" }, "source": [ "The default temperature setting of `1.0` also translates wild animals as rescued animals." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZY8uR7uoPeil", "outputId": "97d49ea6-ce12-4d81-aac0-de8762f723eb" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "`Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\n", "\n", "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", "\n", "`It is good to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.`\n", "\n", "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", "\n", "`Today is a good day to go out and play football because it is sunny. 
After that, you can consider visiting the national park for a nature walk while seeing some wild animals.\n" ] } ], "source": [ "result = text_pipeline(\n", " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\"\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "markdown", "metadata": { "id": "v_8a2q-6Pein" }, "source": [ "At temperatures of 0.9 and 0.8, the model doesn't output very good translations.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "FE0I7GNZPeip", "outputId": "24836898-a0c3-4b70-c44a-b879d53ad454" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "The translation of the given sentence in French is `Il est bonne journée pour se déplaacer et jouer au football car cela fait bonjour. Après cela peut-être envisager d'aller dans le parc nation de spectateurs quel sont environnats`\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.9, \"do_sample\": True, \"max_new_tokens\": 500}\n", "\n", "result = text_pipeline(\n", " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", " generation_config=generation_config,\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "uNXWllG-Peiq", "outputId": "937b2223-a95e-437a-ee57-679d6b6eca64" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "`This translation is not entirely accurate as it uses \"le joueur\" instead of \"je joue\" and does not include the last part of the sentence.\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.8, \"do_sample\": True, \"max_new_tokens\": 500}\n", "result = text_pipeline(\n", " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", " generation_config=generation_config,\n", ")\n", "print(result.generations[0].text)" ] }, { "cell_type": "markdown", "metadata": { "id": "wSXcEOo8Peiq" }, "source": [ "At 0.7, the model translated wild animals to rescued animals." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sR-78LZYPeiq", "outputId": "555d0a37-8b2d-4458-edca-ad8bd3a5508f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "The sentence translates to `Il est bon de sortir et jouer au football ce jour-la parce qu'il est bellement. Ensuite, il est possible de visiter le parc nationaume pour une randonnée en nature où voyager des animaux sauvés.`\n" ] } ], "source": [ "generation_config = {\"temperature\": 0.7, \"do_sample\": True, \"max_new_tokens\": 500}\n", "result = text_pipeline(\n", " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. 
After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kADJvxjgPeiq" + }, + "source": [ + "At 0.6 the model didn't translate at all; it simply repeated part of the sentence in English." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZR1gQKxlPeiq", + "outputId": "f0840b29-cffc-4923-8d54-ed0fbfe873fe" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "The sentence in English reads: \"Today is a good day to go out and play football because it is sunny.\"\n" + ] + } + ], + "source": [ + "generation_config = {\"temperature\": 0.6, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "result = text_pipeline(\n", + " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lcZ8k-MdPeir" + }, + "source": [ + "## top_k" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zhQhU-gUB5b1" + }, + "source": [ + "The number of highest-probability vocabulary tokens to keep for top-k filtering." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iJXcZiKePeir" + }, + "source": [ + "### Summarization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KbsAZTiqB5b1" + }, + "source": [ + "`top_k=0` and `top_k=50` give similar results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9D9R2b88B5b1", + "outputId": "557608f6-6eb1-4853-ac8e-00e6806b6fce" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning models without accuracy loss\n", + "The difficulty in handling non-differential quantization\n", + "The researchers’ solution was to use distillation loss to achieve high sparsity levels.\n" + ] + } + ], + "source": [ + "generation_config = {\"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OynMBCZRPeir" + }, + "source": [ + "Summary with `top_k=50` is quite concise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "g9h1P8pfPeir", + "outputId": "94011f1d-a448-470b-cbb9-1f887c3fe929" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning models without accuracy loss\n", + "The difficulty in handling non-differential quantization\n", + "The researchers’ solution was to use distillation loss to achieve high sparsity levels.\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 50, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. 
One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BDXkrTIDPeir", + "outputId": "b8458ae1-0161-4497-a415-2fb40fc83b6d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning models without accuracy loss\n", + "The difficulty in handling non-differential quantization\n", + "The researchers’ solution was to use distillation loss to achieve high sparsity levels.\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 50, \"max_new_tokens\": 300}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. 
In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Rop3RId-Peir" + }, + "source": [ + "Summary with `top_k=10` is too short." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xFPLLjmiPeir", + "outputId": "6d675524-a66f-422a-a616-d5bf064fcd9c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Both tasks involve compressing parameters without overfitting and achieving stateofferency performance in both cases\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 10, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Cmlj4EYQB5b2", + "outputId": "a3fe44b4-4bde-4a1d-a5c2-3f710c40125a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tricks like unrolling or folding don in some cases do not work effectively for all architectures\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 20, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ilkBUmMgPeir", + "outputId": "4d6c97e0-6a14-47c5-81c5-0a526a821667" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In contrast\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 40, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jc-49RTnPeir" + }, + "source": [ + "Summary at `top_k=60` is not clear." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LZcKc8ELPeis", + "outputId": "c3441b0d-b645-4072-9f93-c429b6e7aaac" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Difficulty in achieving competitive precision metrics because it tends towards 2 or 3 bits per parameter due FOSCO (Fosco Lab) Despite reducing precision up until now at 100%, FSPC still shows higher performance when optimizing offline and online quantizers (up until 2 iterations), including quantized layer drops (only 1 bit). A new record achieved 6 iterations thanks\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 60, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8_n5pWH2B5b2" + }, + "source": [ + "Summary at `top_k=70` is poor. 
In general, summaries at high `top_k` values are poor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9cKRnMlvB5b2", + "outputId": "1567b2cc-4ed3-477f-c8ce-6d6af6948e95" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Smaller and cheaper hardware can result in performance drops due not accounting for underlying architecture differences between hardware types (such “green” ARM processors vs “gray” Intel CPUs) \n", + "\n", + "\n", + " As I mention\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 70, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MCi4jn-DB5b3", + "outputId": "f96c937d-a265-44f4-cd75-dc4dbf1e2845" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To achieve state\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 80, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JfXAJluNB5b3", + "outputId": "ac09eb66-76ba-442c-f41e-0abcb5e70e11" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The need\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 90, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DS4EKEbAPeit" + }, + "source": [ + "### Creative Writing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LtB3F8OOB5b3" + }, + "source": [ + "`top_k=0` has repetitions:\n", + "`As the character continued to explore the new realm...`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IMzRddP_B5b3", + "outputId": "d47099e6-108b-4d14-d5cd-9fcbefe1efb6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\n", + "The package arrived in the mail one morning, addressed to the character in an unfamiliar handwriting. The package was wrapped in black tape and sealed with a strange symbol etched into the lid. As the character opened the package, they found themselves staring at a strange crystal, unlike anything they’d ever seen before.\n", + "The crystal was translucent, and as the character held it in their hand, they could see the faint outlines of whatever was inside. The crystal seemed to pulse and glow, and the character felt a strange sensation in their chest.\n", + "As the character continued to examine the crystal, they realized that it was a portal to another dimension. The crystal was a gateway to a realm of infinite possibilities, and the character was suddenly filled with excitement and wonder.\n", + "The character spent the next few days exploring the new realm, discovering new worlds and meeting new beings. The experience was exhilarating, and the character felt alive and invigorated.\n", + "As the character continued to explore the new realm, they realized that the crystal was a portal to a realm of infinite possibilities. The world was full of wonder and excitement, and the character felt alive and invigor in the new realm.\n", + "As the character continued to explore the new realm, they realized that the crystal was a portal to a realm of infinite possibilities. The world was full of wonder\n" + ] + } + ], + "source": [ + "generation_config = {\"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KG_HLxy9Peit" + }, + "source": [ + "The story with `top_k=50` seems okay, but there is some repetition, which we will address later.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAKG6IX4Peit", + "outputId": "a3d827e0-f70d-4226-9d05-b32d637fab4c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\n", + "The package arrived in the mail one morning, addressed to the character in an unfamiliar handwriting. The package was wrapped in black tape and sealed with a strange symbol etched into the lid. 
As the character opened the package, they found themselves staring at a strange crystal, unlike anything they’d ever seen before.\n", + "The crystal was translucent, and as the character held it in their hand, they could see the faint outlines of whatever was inside. The crystal seemed to pulse and glow, and the character felt a strange sensation in their chest.\n", + "As the character continued to examine the crystal, they realized that it was a portal to another dimension. The crystal was a gateway to a realm of infinite possibilities, and the character was suddenly filled with excitement and wonder.\n", + "The character spent the next few days exploring the new realm, discovering new worlds and meeting new beings. The experience was exhilarating, and the character felt alive and invigorated.\n", + "As the character continued to explore the new realm, they realized that the crystal was a portal to a realm of infinite possibilities. The world was full of wonder and excitement, and the character felt alive and invigor in the new realm.\n", + "As the character continued to explore the new realm, they realized that the crystal was a portal to a realm of infinite possibilities. The world was full of wonder\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 50, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ttM27vp5B5b3" + }, + "source": [ + "`top_k=80` has no repetition" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PPBmYNYHPeit", + "outputId": "3f19011c-6cd2-41a5-fc54-48ec4561c0a5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "An object that has been lost for centuries is found by a person who hasn’t seen the object since they were a child. What is the object and how does it change their life?\n", + "A person discovers an abandoned house, locked door, or buried unknown personal belongings amidst its walls. How does this event change their life?\n", + "A young woman suddenly inherits her father’s old watch. What does this gift mean to her? Did it have sentimental or symbolic value? Did she wear the watch as often as her other jewels and accessories?. Were there any emotional repercussions from inheriting this particular item?.\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 80, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. 
What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pMkgehwxPeit" + }, + "source": [ + "### RAG" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y2KNm5_gB5b4" + }, + "source": [ + "The results with the default `top_k` of 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y81-H8FSPeit", + "outputId": "0992a843-71ca-472d-ac76-50f50d9384f0" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 126334.46it/s]\n", + "2023-10-26 06:18:23 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:18:23 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:18:35 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit, pannenkoeken, and ontbijtkoeken.\n" + ] + } + ], + "source": [ + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH,\n", + " model_config=model_config,\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QjOrygDsB5b4" + }, + "source": [ + "The results with `top_k=50` are the same as those with the default value of `top_k`. The word `ontbijtkoeken` doesn't appear in the given documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VlV4pxJuPeit", + "outputId": "356a2205-602c-491d-a41d-cf37f4391019" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 129453.83it/s]\n", + "2023-10-26 06:19:00 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. 
This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:19:00 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:19:12 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit, pannenkoeken, and ontbijtkoeken.\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 50, \"max_new_tokens\": 500}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1usUj7zbB5b4" + }, + "source": [ + "At `top_k=60` the model seems to try to respond in Dutch, but some of the words don't make sense even in Dutch. Here is the translation:\n", + "\n", + "```\n", + "Typically Dutch breakerwoensesum consists of bredseed and large prawn anald Drenthe turned flower bulbs without summer zinnede garlic and cabbage point wind\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MjpRQzBJPei5", + "outputId": "f740e6e9-3db8-44a1-cd60-cdaa11ccb88a" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 125203.10it/s]\n", + "2023-10-26 06:19:39 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. 
This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:19:39 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:19:50 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Typically dutch brekerwoensetsum consists og bredsedie en grote smaardanaald drentse omgekekte bloembolten zonder zomerzinnede knoflook en koolepuntvoorwind\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 60, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Nq6PxXUB5b4" + }, + "source": [ + "At `top_k=70` the output doesn't make sense even when translated to English:\n", + "\n", + "```\n", + "Bread that's baked like toast; bagels à la Paris; rondvooraants/frites; olives; fruits like kiataloosen; chocolate goods end and speculation large gelgere! Spare: breed for gulaai tasks! Licking/short cum teenage man/strengths will be bigger with help! Thank you\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x6r2tyoYPei5", + "outputId": "d25e7072-41a0-4d4a-8615-464e9a730fa2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 89240.51it/s]\n", + "2023-10-26 06:20:20 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:20:20 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:20:31 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Bread that's baked like toast; bagels à la Paris; rondvooraants/frites; olives; fruits like kiatalozen; chocoladewaren eind en speculatiegroot gelgere! Ontziet: bred voor van gulaaitaaken! Oploplikking/kort kom tienerman/strengents will groter zijn worden wordt met behulp! 
Dankelijk\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 70, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eeGQezmyB5b4" + }, + "source": [ + "At `top_k=90` the model also mentions items that don't appear in the given text, such as corn flakes and cereal." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uCv1P6THPei5", + "outputId": "e614b551-12d3-4635-ab7d-1a2acda7f167" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 152520.15it/s]\n", + "2023-10-26 06:21:05 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:21:05 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:21:17 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Typically yogurt; fruits like bananas; cereal; corn flakes / rayu\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 90, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9YPlr78PB5b4" + }, + "source": [ + "At `top_k=99` we don't really get the breakfast options." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GquzXMQXPei6", + "outputId": "c6afa2f3-33e4-40ff-be05-8e7ccd95ea1f" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 133576.56it/s]\n", + "2023-10-26 06:21:43 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. 
This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:21:43 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:21:55 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Many different ways exist; besides traditional eating habits one could consider having cake (\"boosterkoken\")\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 99, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lm9XMYIEPei6" + }, + "source": [ + "### Language Translation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w0Ne0OuSB5b5" + }, + "source": [ + "`top_k=0`\n", + "```\n", + "`It's good to go out and play football because it's a sunny day. After that, it is possible to visit the national park for a nature walk where it is possible to meet some rescued animals.`\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "i6GkIUzMB5b5", + "outputId": "8679a022-eb67-49c8-c8df-0200e1470761" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "`Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\n", + "\n", + "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", + "\n", + "`It is good to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.`\n", + "\n", + "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", + "\n", + "`Today is a good day to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.\n" + ] + } + ], + "source": [ + "generation_config = {\"max_new_tokens\": 300}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. 
After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bxqDCPcAPei6" + }, + "source": [ + "At `top_k=50`\n", + "\n", + "```\n", + "`It's good to go out and play football because it's a sunny day. After that, it is possible to visit the national park for a nature walk where it is possible to meet some rescued animals.`\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIqZooDbPei6", + "outputId": "93667215-706d-4a38-9449-06d6fa6c0369" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "`Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\n", + "\n", + "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", + "\n", + "`It is good to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.`\n", + "\n", + "`Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés.`\n", + "\n", + "`Today is a good day to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.\n" + ] + } + ], + "source": [ + "generation_config = {\"top_k\": 50, \"max_new_tokens\": 300}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. 
After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JXwPmLl0Pei7" + }, + "source": [ + "## top_p" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sXnZb9yrB5b5" + }, + "source": [ + "Keep only the smallest set of most probable tokens whose cumulative probability is >= `top_p`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vrN0n0B6Pei7" + }, + "source": [ + "### Summarization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BGb7IitidL2t" + }, + "source": [ + "`top_p: 1.0`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p2ro3R4cPei7", + "outputId": "ceef1562-4908-4bcb-b7b0-0a43878906d8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in pruning models without accuracy loss\n", + "The difficulty in handling non-differential quantization\n", + "The researchers’ solution was to use distillation loss to achieve high sparsity levels.\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 1.0, \"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lc21ueBDdL2x" + }, + "source": [ + "`top_p: 0.90`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Oh_D6DKndL2x", + "outputId": "d6213900-1840-43c6-f316-3cf50a76ff66" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Researchers propose two solutions: \n", + "1) A dynamic loss function that adapts sparsity levels\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.90, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4DXhPs6TdL2x" + }, + "source": [ + "`top_p: 0.80`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iLnpQjR-dL2x", + "outputId": "aa98dc18-cddc-4269-d1ed-5c8dae40b00e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To improve upon previous works in achieving state-of representations for machine translation\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.80, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KxCZ3hHedL2x" + }, + "source": [ + "`top_p: 0.70`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8clj7qYVdL2x", + "outputId": "8c41a42b-d676-4f00-dcad-0ca7a04108df" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The difficulty in handling variable sparsity levels during training and inference\n", + "To achieve high sparsity levels during\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.70, \"max_new_tokens\": 300, \"do_sample\": True}\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n",
+    "The fine-tuning data may not be as large as the training data\n",
+    "The desire to achieve high sparsity levels\n",
+    "\"\"\",\n",
+    "    generation_config=generation_config,\n",
+    ")\n",
+    "print(result.generations[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "n2aTJYSKPei7"
+   },
+   "source": [
+    "### Creative Writing"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JQvwnzHfPei8"
+   },
+   "source": [
+    "With the default value of `top_p`, i.e., 0, the output has no repetitions; the model writes a compelling story while giving the character a name."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "dK3XDRL_Pei-"
+   },
+   "source": [
+    "At `top_p 1.0` the model doesn't repeat itself, but instead of a story it gives some ideas on how the story could play out."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "smiqCVBVPei-",
+    "outputId": "2b0e22b3-7003-45f4-a3f6-5d2e8593627d"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Our protagonist is an elderly woman named Edna, a widowed and reclusive retiree who lives alone in her small apartment. She has always been interested in mysteries, but generally hasn’t put much effort into pursuing them due (she believes) to her age and lack of interest. However, one day a mysterious package arrives at her doorstep that she otherwise wouldn't have thought to seek out. Despite initial apprehension, she becomes curious about the item and decides to pursue answers regarding its origin and nature by using available resources online. Over time Edna learns the significance of this object: it's a mysterious amulet that grants good luck for those who hold onto it amidst various personal trials on life. As she starts wearing the amulet herself when dealing with stressful situations (including helping others as well that request the amulet), Edna's outlook on life improves significantly; becoming happier in both her personal endeavors as well as interacting with people around her. She becomes more involved with friends than ever before, feeling inspired to actively help others due to witnessing the positive impacts of helping others while wearing the amulet herself - something she never would have pursued earlier without such an unexpected gift from unknown benefactors. The amulet also plays an important role in making connections between its wearers throughout humanity while serving as “a source of comfort”. The object transforms those who possess and wear it by boosting mental fortitude; giving new purpose-driven motivation; increasing self-confidence; promoting love for others; encouraging compassion towards oneself; motivating good deeds like charity work or acts of kindness towards others amidst difficult periods of life - ultimately inspiring happiness even amidst adversity through improved outlooks for people around them via symbolism connected to this singularly empowering artifact called “The Lucky Amulet”.\n"
+     ]
+    }
+   ],
+   "source": [
+    "generation_config = {\"top_p\": 1.0, \"do_sample\": True, \"max_new_tokens\": 500}\n",
+    "\n",
+    "result = text_pipeline(\n",
+    "    prompt=\"A character receives a mysterious package containing an object they’ve never seen before. 
What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2vzBiuv7dL2x", + "outputId": "d8def96b-3b75-4fde-a8f5-1a7d09fd90a4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "A young child has a nightmare that foreshadows something terrible happening in their real life. How\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.92, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ldhB5e_sdL2x" + }, + "source": [ + "`top_p: 0.80`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uRu_37mdB5b6", + "outputId": "fbb73034-9b0e-4300-ab82-65201f585f10" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The object is a “chocolate egg”, which gives the character superpowers, allowing them to\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.80, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nwI9r-q-dL2y" + }, + "source": [ + "`top_p: 0.60`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5kdW1LyBdL2y", + "outputId": "ea701976-5639-4623-ecd5-69b58dbe5e0a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The object is a magical amulet, which grants the character immense power and ability. They\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.60, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6i5W6vDdL2y" + }, + "source": [ + "`top_p: 0.50`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qDIbgCEsdL2y", + "outputId": "79f86fb8-8464-410d-db1a-ac2a28e1d8d6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The object is a key. The key unlocks a door that leads to a hidden room.\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.50, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. 
What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_XqAcOWadL2y" + }, + "source": [ + "`top_p: 0.30`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "i8ZbTLRJdL2y", + "outputId": "35ea68a7-b95b-432c-fe97-3526880aba37" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "A young woman discovers that she has the power to control gravity. She uses this power to help\n" + ] + } + ], + "source": [ + "generation_config = {\"top_p\": 0.30, \"do_sample\": True, \"max_new_tokens\": 500}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wr_59AVDPei_" + }, + "source": [ + "### RAG" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "76XRSfCqdL2y" + }, + "source": [ + "`top_p: 1.0`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zL5KvSkYPei_", + "outputId": "42be6183-67de-44f4-8d4b-37d0f70a9590" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 125955.08it/s]\n", + "2023-10-26 06:24:04 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:24:04 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:24:16 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit, pannenkoeken, and ontbijtkoeken.\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"top_p\": 1.0,\n", + " \"max_new_tokens\": 500,\n", + "}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Afm4pOhFdL2y" + }, + "source": [ + "`top_p: 0.92`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nMJgzPeedL2y", + "outputId": "d64071b9-5ccb-498e-fc8c-e2976d1f6caf" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 73584.28it/s]\n", + "2023-10-31 08:31:38 deepsparse.transformers.pipelines.text_generation 
INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit (crispbakes) can be topped with various fruits or whipped cream for dessert\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"top_p\": 0.92,\n", + " \"max_new_tokens\": 500,\n", + " \"do_sample\": True,\n", + "}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZsqMepIVdL2y" + }, + "source": [ + "`top_p: 0.80`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9XORCw3pdL2y", + "outputId": "cee1d51d-9437-47da-e773-1e3f2a6223a2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 90394.48it/s]\n", + "2023-10-31 08:32:28 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit (Dutch crisp bakes) are typically eaten for breakfast in the Netherlands. Pann\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"top_p\": 0.80,\n", + " \"max_new_tokens\": 500,\n", + " \"do_sample\": True,\n", + "}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E2Gdm743dL2z" + }, + "source": [ + "`top_p: 0.70`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "18WYQ7AndL2z", + "outputId": "24d53104-dc70-4bc9-8289-0c30734e8ac6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 137970.53it/s]\n", + "2023-10-31 08:33:10 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. 
This improves performance, but may result in additional memory consumption.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit (a savory cake) or pancake (pannenkoeken) with\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"top_p\": 0.70,\n", + " \"max_new_tokens\": 500,\n", + " \"do_sample\": True,\n", + "}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UPyML--RdL2z" + }, + "source": [ + "`top_p: 0.50`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VZTediFRdL2z", + "outputId": "4cb51f85-9d1b-4535-eb5b-f35f949ce8c6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 127486.44it/s]\n", + "2023-10-31 08:33:48 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit (Dutch crisp bakes) is also eaten as a breakfast food, with the same\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"top_p\": 0.50,\n", + " \"max_new_tokens\": 500,\n", + " \"do_sample\": True,\n", + "}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CaSalKYFPei_" + }, + "source": [ + "### Language Translation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uo_szl4XB5b6" + }, + "source": [ + "Translation with `top_p=0`\n", + "\n", + "```\n", + "It's good to go out and play football because it's sunny. After that, it is possible to visit the national park for nature hiking and seeing wild animals.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tQbcMWtbB5b6", + "outputId": "fe70fdbf-eceb-46a2-e26e-e18fe6a8b4ae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "English: Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.\n", + "\n", + "French: Il est bon de sortir et jouer au football parce qu'il est soleil. Après cela, il est possible de visiter le parc national pour une randonnée en nature et voir des animaux sauvages.\n", + "\n", + "Translation: It is good to go out and play football because it is sunny. 
After that, it is possible to visit the national park for a nature walk while seeing some wild animals.\n" + ] + } + ], + "source": [ + "generation_config = {\"max_new_tokens\": 300}\n", + "result = text_pipeline(\n", + " prompt=\"Translate the following sentence to French 'Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.'\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xU4DTYrOPei_" + }, + "source": [ + "`top_p: 1.0`\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_mR4_wT4Pei_", + "outputId": "1fa83eb9-16cc-49e4-caab-91fb3f9bb571" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "English: Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.\n", + "\n", + "French: Il est bon de sortir et jouer au football parce qu'il est soleil. Après cela, il est possible de visiter le parc national pour une randonnée en nature et voir des animaux sauvages.\n", + "\n", + "Translation: It is good to go out and play football because it is sunny. After that, it is possible to visit the national park for a nature walk while seeing some wild animals.\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"top_p\": 1.0,\n", + " \"max_new_tokens\": 500,\n", + "}\n", + "result = text_pipeline(\n", + " prompt=\"Translate the following sentence to French 'Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.'\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xY19OGTJdL2z" + }, + "source": [ + "`top_p: 0.92`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HD-90KgLdL2z", + "outputId": "e0e6839c-6048-4371-b5ba-9756faf029a6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Translation: Il est bon de sortir et jouer au football aujourd'hui\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"top_p\": 0.92,\n", + " \"max_new_tokens\": 500,\n", + " \"do_sample\": True,\n", + "}\n", + "result = text_pipeline(\n", + " prompt=\"Translate the following sentence to French 'Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.'\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CgcRnf2OdL2z" + }, + "source": [ + "`top_p: 0.80`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "r7io4FnMdL2z", + "outputId": "2bb426c6-eef0-480b-cb83-517ce263fe2f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "'Today is a good day to go out and play football because it is sunny. 
After\n"
+     ]
+    }
+   ],
+   "source": [
+    "generation_config = {\n",
+    "    \"top_p\": 0.80,\n",
+    "    \"max_new_tokens\": 500,\n",
+    "    \"do_sample\": True,\n",
+    "}\n",
+    "result = text_pipeline(\n",
+    "    prompt=\"Translate the following sentence to French 'Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.'\",\n",
+    "    generation_config=generation_config,\n",
+    ")\n",
+    "print(result.generations[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "HpuIAnCKdL2z"
+   },
+   "source": [
+    "`top_p: 0.70`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "LFGREJfvdL2z",
+    "outputId": "9750e24f-13de-4710-af67-8cbf501c23a0"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "Translated to French: 'Aujourd'hui est un bon journée pour\n"
+     ]
+    }
+   ],
+   "source": [
+    "generation_config = {\n",
+    "    \"top_p\": 0.70,\n",
+    "    \"max_new_tokens\": 500,\n",
+    "    \"do_sample\": True,\n",
+    "}\n",
+    "result = text_pipeline(\n",
+    "    prompt=\"Translate the following sentence to French 'Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.'\",\n",
+    "    generation_config=generation_config,\n",
+    ")\n",
+    "print(result.generations[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "KEJu2gOWdL2z"
+   },
+   "source": [
+    "`top_p: 0.50`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "GDmhqLv7dL2z",
+    "outputId": "5409f8f1-8b22-4f55-8e45-213cf7152767"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "English: Today is a good day to go out and play football because it is sunny.\n"
+     ]
+    }
+   ],
+   "source": [
+    "generation_config = {\n",
+    "    \"top_p\": 0.50,\n",
+    "    \"max_new_tokens\": 500,\n",
+    "    \"do_sample\": True,\n",
+    "}\n",
+    "result = text_pipeline(\n",
+    "    prompt=\"Translate the following sentence to French 'Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.'\",\n",
+    "    generation_config=generation_config,\n",
+    ")\n",
+    "print(result.generations[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "jdoVfl9pPei_"
+   },
+   "source": [
+    "## Repetition Penalty"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "S00wSq5OB5b6"
+   },
+   "source": [
+    "Penalty applied when generating new tokens: each token's existing frequency in the text so far is subtracted from its corresponding logit, making tokens that have already appeared less likely to be generated again."
+   ]
+  },
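+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make that concrete, the next cell is a minimal NumPy sketch of a frequency-based penalty subtracted from the logits. It only illustrates the idea described above; the function and values are made up for this example, and it is not DeepSparse's internal implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def apply_repetition_penalty(logits, generated_ids, penalty):\n",
+    "    # Count how often each vocabulary token has already been generated.\n",
+    "    counts = np.bincount(generated_ids, minlength=logits.shape[0])\n",
+    "    # Subtract the scaled frequency from each token's logit so tokens\n",
+    "    # that already appeared become less likely to be sampled again.\n",
+    "    return logits - penalty * counts\n",
+    "\n",
+    "\n",
+    "logits = np.array([2.0, 1.0, 0.5, 0.1])\n",
+    "generated_ids = np.array([0, 0, 1])  # token 0 appeared twice, token 1 once\n",
+    "print(apply_repetition_penalty(logits, generated_ids, penalty=0.5))\n",
+    "# [1.  0.5 0.5 0.1]"
+   ]
+  },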
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "HvikePzCPei_"
+   },
+   "source": [
+    "### Summarization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "czKqtqihPei_",
+    "outputId": "24d0a446-601b-41cc-c6c8-b560bdad34c0"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Large or many layers often lead\n"
+     ]
+    }
+   ],
+   "source": [
+    "generation_config = {\n",
+    "    \"repetition_penalty\": 1.0,\n",
+    "    \"do_sample\": True,\n",
+    "    \"max_new_tokens\": 300,\n",
+    "}\n",
+    "\n",
+    "result = text_pipeline(\n",
+    "    prompt=\"\"\"\n",
+    "Write a concise summary of the following:\n",
+    "\n",
+    "Sparse Finetuning for Inference Acceleration of Large Language Models\n",
+    "\n",
+    "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n",
+    "\n",
+    "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n",
+    "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n",
+    "This post will dive into more details from this paper.\n",
+    "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n",
+    "\n",
+    "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n",
+    "Speech transcription using Whisper\n",
+    "Machine translation using T5\n",
+    "Higher-level reasoning using the open GPT-type MPT model\n",
+    "Challenges of Large Language Models\n",
+    "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n", + "The fine-tuning data may not be as large as the training data\n", + "The desire to achieve high sparsity levels\n", + "\"\"\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HP06ROLHdL20" + }, + "source": [ + "`repetition_penalty: 2.0`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vvwANSZpdL20", + "outputId": "3d725518-0675-48d4-f424-c341f32f62b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Efficient approximations were used such and soft attention matrices did poorly when evaluated against dense alternatives\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"repetition_penalty\": 2.0,\n", + " \"do_sample\": True,\n", + " \"max_new_tokens\": 300,\n", + "}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"\"\"\n", + "Write a concise summary of the following:\n", + "\n", + "Sparse Finetuning for Inference Acceleration of Large Language Models\n", + "\n", + "Fine-tuning large language models to obtain a small but accurate model is extremely difficult. This is because you have to strike a balance between the model’s size and accuracy. Researchers from IST Austria & Neural Magic seem to have found a sweet spot. In their latest paper, they successfully applied sparse fine-tuning on MPT with remarkable performance. The MPT model was pruned to 75% without a drop in accuracy, showing performance that is on par with quantization approaches.\n", + "\n", + "Particularly, the resulting sparse model can execute fast on CPUs by taking advantage of sparsity. Instead of performing standard loss-based fine-tuning which may fail to recover accuracy, the researchers experiment with distillation-type losses. These losses are better at recovering accuracy at high sparsity.\n", + "What’s impressive is that the sparse fine-tuned model achieves 7.7 tokens per second on a single core and 26.7 tokens per second on 4 cores of a cheap consumer AMD Ryzen CPU.\n", + "This post will dive into more details from this paper.\n", + "The researchers aim to address the high cost of running large language models. One of the most popular techniques for doing so is quantization where the precision of the weights is reduced to 4 bits. However, at around 3 bits per weight, it becomes hard to recover accuracy.\n", + "\n", + "Introducing weight sparsity is an alternative to quantization where certain connections in the network are set to zero. The sparsity introduced during fine-tuning leads to a significantly faster model. In this paper, the authors study sparse fine-tuning for large language models for the following applications:\n", + "Speech transcription using Whisper\n", + "Machine translation using T5\n", + "Higher-level reasoning using the open GPT-type MPT model\n", + "Challenges of Large Language Models\n", + "When fine-tuning the large language models the researchers faced several challenges which they resolved. 
The challenges came from the fact that the:\n",
+    "The fine-tuning data may not be as large as the training data\n",
+    "The desire to achieve high sparsity levels\n",
+    "\"\"\",\n",
+    "    generation_config=generation_config,\n",
+    ")\n",
+    "print(result.generations[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0Iqh6QyRPejA"
+   },
+   "source": [
+    "### Creative Writing"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "svjujaA7PejA"
+   },
+   "source": [
+    "With no repetition penalty, the model repeats the phrases `As the character` and `excitement and wonder`.\n",
+    "\n",
+    "But with a repetition penalty of 2.0 the model seems to write a story with no repetition:\n",
+    "```\n",
+    "\n",
+    "The character discovers that the object is actually some sort of magical artifact. They begin to experience powers and abilities corresponding with the artifact, which are unpredictable at first but become predictable over time. The mystery gradually opens up into an epic quest complete with battles against evil forces trying to seize control of the artifact for sinister purposes, which eventually culminates in saving humanity from peril in one final desperate effort. Ultimately, after having confronted numerous challenges along the way, including moments where seemingly impossible obstacles stood in their path or enemies threatening death or degradation if they were consumed by greed for power --- even when these encounters threatened losing everything dear to them --- they arrive at a satisfying resolution by leveraging their newfound magic into fighting back against those who sought its possession as well: namely those bent on conquest amidst chaos raging across nations as opposed to peaceful society (i.); hence ensuring that prosperity was guarded by heroically defending principles vital to society’s wellbeing; all while helping others overcome adversity through compassion and empathy toward other people suffering alongside them\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "17nR4q-OB5b7"
+   },
+   "source": [
+    "In the text below there is repetition of the phrase `An elderly man uses his understanding`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "3-gOi2HtPejA",
+    "outputId": "90179bf1-6da0-4f3c-861e-775927014630"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "As a result of receiving the object, the character sees and experiences new things that they otherwise wouldn’t have encountered. Perhaps they discover a new passion or hobby, or maybe they become more compassionate and empathetic due Ingsights, though these experiences are different for each character.\n",
+      "In this story, an elderly man receives instructions on how to access his futureself in order as he faces retirement with his current self. 
Being empowered with insights into his future self helps him make sound retirement decisions today so he can enjoy retirement without stressing about finances.\" /> The elderly man uses his understanding of his future self to make sound retirement decisions today so he can enjoy retirement without stressing about finances.\" />\"> An elderly man uses his understanding of his future self to make sound retirement decisions today so he can enjoy retirement without stressing about finances.\"> />An elderly man uses his understanding of his future self to make sound retirement decisions today so he can enjoy retirement without stressing\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"repetition_penalty\": 1.0,\n", + " \"do_sample\": True,\n", + " \"max_new_tokens\": 500,\n", + "}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DfEMOLYSB5b7" + }, + "source": [ + "The text below contains no repetitions:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ux-WMW9OPejA", + "outputId": "7b932bec-a72c-45e1-a24b-b71562fc4420" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The object could be magical, causing the recipient to experience new powers or abilities related to magic or technology. Perhaps the recipient uses this object in conjunction with existing abilities, enhancing them in some way. Or maybe the object has dangerous potential; perhaps this new ability comes with responsibilities and dangers that come along with it.\n", + "Alternatively, the item could have historical significance; perhaps someone from another time who helped protect magic/technology ends up getting caught up in human history and becomes relevant once again through these objects sent from that era into our world today through some unforeseen connection of fate (or possibly a technological breakthrough). In any case, there must be consequences for receiving such an unexpected gift as these are often not without explanation – leaving us guessing what otherworldly events may emerge due The Unknown Gift! It’s anyone's guess what lies ahead as we delve into unexplored territory! Maybe whoever knows about The Unknown Gift will also know more information about ancient prophecy at war? We'll see!\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"repetition_penalty\": 2.0,\n", + " \"do_sample\": True,\n", + " \"max_new_tokens\": 300,\n", + "}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "trsJl2yxB5b7", + "outputId": "0a55402c-7ff2-4e84-f0bc-e1113120d293" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "As the mystery unfolds, our main character discovers that the object can be used as a weapon for good or evil purposes. 
If you’re looking for deep plot lines, this series is perfect!\n", + "The main character must choose: whether to use the mystical item in their possession to fight crime and injustice or help those caught by catastrophic events such Militarized police response teams against terrorist attacks on innocent civilians abroad. If you want stories of heroism this series is ideal!\n", + "If you are looking for plots involving magic artifacts that bring peril with them; This may not be ideal if your aim is to avoid complex moral dilemmas & decisions which challenge conventional norms of behavior in society today.\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"repetition_penalty\": 2.0,\n", + " \"do_sample\": True,\n", + " \"max_new_tokens\": 300,\n", + "}\n", + "\n", + "result = text_pipeline(\n", + " prompt=\"A character receives a mysterious package containing an object they’ve never seen before. What is it and how does it change their life?\",\n", + " generation_config=generation_config,\n", + ")\n", + "print(result.generations[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OV11R0OBPejA" + }, + "source": [ + "### RAG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "78rWfEBYPejA", + "outputId": "96991c4e-78c0-4390-ed21-b09fd7e48d6c" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 135300.13it/s]\n", + "2023-10-26 06:25:38 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:25:38 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:25:51 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit, pannenkoeken, and ontbijtkoeken.\n" + ] + } + ], + "source": [ + "model_config = {\"sequence_length\": 2048}\n", + "generation_config = {\"max_new_tokens\": 500}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pRq_68p4PejA", + "outputId": "5f997565-566b-480a-c423-a3db97f8b6ba" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 132312.43it/s]\n", + "2023-10-26 06:26:17 deepsparse.transformers.pipelines.text_generation INFO Compiling an auxiliary engine to process a prompt with a larger processing length. 
This improves performance, but may result in additional memory consumption.\n", + "2023-10-26 06:26:17 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n", + "2023-10-26 06:26:30 deepsparse.utils.onnx INFO Overwriting in-place the input shapes of the transformer model at /home/mwiti/.cache/huggingface/hub/models--neuralmagic--mpt-7b-chat-pruned50-quant/snapshots/a1b59e5acd426be155761950cc9ac297635616bf/model.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Beschuit, pannenkoeken, and ontbijtkoeken.\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"repetition_penalty\": 1.0,\n", + " # \"do_sample\": True,\n", + " \"max_new_tokens\": 500,\n", + "}\n", + "model_config = {\"sequence_length\": 2048}\n", + "llm = DeepSparse(\n", + " model=MODEL_PATH, model_config=model_config, generation_config=generation_config\n", + ")\n", + "chain = RetrievalQA.from_chain_type(\n", + " llm,\n", + " chain_type=\"stuff\",\n", + " return_source_documents=True,\n", + " retriever=docsearch.as_retriever(),\n", + ")\n", + "res = chain({\"query\": \"What are some Dutch breakfast options?\"})\n", + "answer = res[\"result\"]\n", + "print(answer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_Ntp9SLdPejA" + }, + "source": [ + "### Language Translation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0Dd1No2gPejA" + }, + "source": [ + "repetition_penalty 1.0:\n", + "```\n", + "I translate this sentence into French as `Today is a good day to play sports games in large areas of the national park to walk in landscapes with real animals.’\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-cNVgqqEPejA", + "outputId": "29aeaa22-86be-427a-c52c-0a2a077ed898" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " I translate this sentence in French as `Ce jour est un bonne journée pour allumer en jeu sportif dans de larges espaces du parc nationaume pour se promener dans des paysages avec de vrais animaux.’\n" + ] + } + ], + "source": [ + "generation_config = {\n", + " \"repetition_penalty\": 1.0,\n", + " \"do_sample\": True,\n", + " \"max_new_tokens\": 300,\n", + "}\n", + "result = text_pipeline(\n", + " prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. 
After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n",
+    "    generation_config=generation_config,\n",
+    ")\n",
+    "print(result.generations[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kd1PCl1zB5b8"
+   },
+   "source": [
+    "Repetition penalty 2.0:\n",
+    "\n",
+    "```\n",
+    "\"Ah today Thursday is a good day for the celebration of the sport within the framework of a football match, established on this basis are real which was made to be today also appropriate park environment; the be will have to visit also was an old equivalence that a country could offer.\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "oWJ67qLoB5b8",
+    "outputId": "4e48f3d5-6999-40fc-e420-407f24e3e665"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "I would translate this sentence as follows: \n",
+      "\"Ah today jeudi est un bon jour à cause la célébration du sport dans le cadre d'un match de football, établis sur cette base sont réelles qui était fait pour être aujourd'hui également opportun environnement de parc; l’être devront visiter également était une vieille équivalence que désirous pouvant offrir un pays.\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "generation_config = {\n",
+    "    \"repetition_penalty\": 2.0,\n",
+    "    \"do_sample\": True,\n",
+    "    \"max_new_tokens\": 300,\n",
+    "}\n",
+    "result = text_pipeline(\n",
+    "    prompt=\"Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`\",\n",
+    "    generation_config=generation_config,\n",
+    ")\n",
+    "print(result.generations[0].text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2i-vVFHqB5b8"
+   },
+   "source": [
+    "## Conclusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "U-4q4u1jB5b8"
+   },
+   "source": [
+    "### Temperature\n",
+    "Increasing the temperature is a good idea for creative tasks because it increases the variety of the words selected by the model. This helps ensure that the generated text is interesting because the model doesn't always choose the same words, as a greedy approach would. \n",
+    "\n",
+    "The same behaviour may not be desired for RAG applications where you want the model to answer the questions from the provided text. For this type of application, lowering the temperature may be more desirable. The same can be said for translation tasks since you don't want the model to get \"creative\" but to be more \"confident\" in its responses.\n",
+    "\n",
+    "In summarization, you also don't want the model to get too creative but to summarize the given text, so a lower temperature may be better.\n",
+    "\n",
+    "### `top_k`\n",
+    "`top_k` is an important parameter, particularly for creative tasks. As seen in the notebook, the model repeats certain phrases and sentences when restricted to only a few `top_k` words, but repetition decreases as `top_k` is increased. You can increase this number gradually and observe if the story remains coherent.\n",
+    "\n",
+    "Best results for RAG were obtained with the default `top_k` value of 0. Since this is not a creative task, the results from this notebook indicate that increasing the value of `top_k` leads to misleading answers that are not even in the given text.\n",
+    "\n",
+    "For summarization, increasing the word pool by increasing the value of `top_k` leads to poor summaries, where the model sometimes generates text that is not related to the given content.\n",
+    "\n",
+    "Using a high `top_k` such as 300 for translation leads to extremely poor results where the translated text is not related to the original text. This can be attributed to the fact that the model becomes more \"creative\" since there are so many words to pick from. Therefore, for translation tasks, it's better to keep this number low. In this notebook, 50 gave reasonable results.\n",
+    "\n",
+    "### `top_p`\n",
+    "`top_p` sampling is a strategy that chooses the value of `k` dynamically: the model samples from the smallest set of words whose cumulative probability exceeds `p`, so the number of candidate words varies from step to step. For instance, if you pick `p` as 0.8, the probabilities of the chosen words could be 0.5+0.2+0.1 or 0.3+0.3+0.2.\n",
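+    "\n",
+    "As a rough illustration of that selection step (a sketch with made-up words and probabilities, not DeepSparse's internal code):\n",
+    "\n",
+    "```python\n",
+    "probs = {\"park\": 0.5, \"beach\": 0.2, \"game\": 0.1, \"piano\": 0.05}\n",
+    "top_p, total, kept = 0.8, 0.0, []\n",
+    "for word, p in sorted(probs.items(), key=lambda kv: -kv[1]):\n",
+    "    kept.append(word)\n",
+    "    total += p\n",
+    "    if total >= top_p - 1e-9:  # small tolerance for floating-point rounding\n",
+    "        break\n",
+    "print(kept)  # ['park', 'beach', 'game'] -> 0.5 + 0.2 + 0.1 = 0.8\n",
+    "```\n",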
+    "\n",
+    "### Repetition Penalty\n",
+    "It is a good idea to penalize the model for repetition in creative tasks to make sure it doesn't keep repeating the same phrases. As seen earlier in the notebook, we were able to get stories without repetition when we penalized the model for repetition.\n",
+    "\n",
+    "Repetition doesn't seem to be a problem for RAG, summarization, and translation tasks. In fact, from this notebook, it looks like penalizing the model for repetition can lead to poor performance in translation tasks, especially when certain words legitimately appear several times in the sentence.\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

From a2922a51cadb5f6b79f34e0ff5714cb04a2f9f96 Mon Sep 17 00:00:00 2001
From: Derrick Mwiti
Date: Fri, 3 Nov 2023 12:56:00 +0300
Subject: [PATCH 2/8] add readme

---
 notebooks/generate-text/readme.md | 58 +++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 notebooks/generate-text/readme.md

diff --git a/notebooks/generate-text/readme.md b/notebooks/generate-text/readme.md
new file mode 100644
index 0000000..aef4a2b
--- /dev/null
+++ b/notebooks/generate-text/readme.md
@@ -0,0 +1,58 @@
+# How to Generate Text on CPUs With DeepSparse
+
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neuralmagic/notebooks/blob/main/notebooks/generate-text/generate.ipynb)
+
+This notebook walks through different strategies for generating text using DeepSparse on CPUs. 
+ +## Installation +```bash +pip install deepsparse-nightly[llm] langchain sentence-transformers chromadb datasets +``` +## Generate Text +```python +from deepsparse import TextGeneration + +MODEL_PATH = "hf:neuralmagic/mpt-7b-chat-pruned50-quant" + +text_pipeline = TextGeneration(model_path=MODEL_PATH, sequence_length=2048) + +generation_config = {"top_k": 50, "max_new_tokens": 300} + +result = text_pipeline( + prompt="Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`", + generation_config=generation_config, +) +print(result.generations[0].text) + +"""Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés""" + +``` +## Text Generation Parameters +[DeepSparse](https://github.com/neuralmagic/deepsparse/) allows to set different text generation parameters. + +### Temperature + +The temperature parameter is used to MODIFY the logits. The logits are passed to the softmax function to turn them into probabilities, making it easier to pick the next word. + +When the temperature is 1, the operation is the same as that of a normal softmax operation. + +When the temperature is high, the model becomes more random hence associated with being more creative. + +When the temperature is low the model becomes more conservative hence more confident with its responses. + +### Top K +In `top_k` sampling, the user sets the top number of words that the model will sample from. For example, if K is 50, the model will sample from the top 50 words. + +The problem with this is that you have to manually choose the K, meaning some words are left out. This may not be ideal for some use cases such as creative writing. + +### Top P +In `top_p` sampling, the value of K is set dynamically by setting a desired probability. + +The model will choose the least number of words that exceed the chosen probability, making the number of words dynamic. + +For instance, if you pick p as 0.8. The probability of words picked can be 0.5+0.2+0.1 or 0.3+0.3+0.2. + +### Repetition Penalty +Repetition penalty is an important parameter that ensures the model doesn't repeat certain words or phrases. + +Setting it to 1, means that there is no penalty. For example, in creative writing, you can penalize the model for repeating phrases that recently appeared in the text. From 02e071729d449706aac12c0cc04b0aa55a22fe0b Mon Sep 17 00:00:00 2001 From: Derrick Mwiti Date: Fri, 3 Nov 2023 13:00:44 +0300 Subject: [PATCH 3/8] update readme --- notebooks/generate-text/readme.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/generate-text/readme.md b/notebooks/generate-text/readme.md index aef4a2b..c8b2750 100644 --- a/notebooks/generate-text/readme.md +++ b/notebooks/generate-text/readme.md @@ -19,7 +19,7 @@ text_pipeline = TextGeneration(model_path=MODEL_PATH, sequence_length=2048) generation_config = {"top_k": 50, "max_new_tokens": 300} result = text_pipeline( - prompt="Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. 
After that, you can consider, visiting the national park for a nature walk while seeing some wild animals.`", + prompt="Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.`", generation_config=generation_config, ) print(result.generations[0].text) @@ -28,11 +28,11 @@ print(result.generations[0].text) ``` ## Text Generation Parameters -[DeepSparse](https://github.com/neuralmagic/deepsparse/) allows to set different text generation parameters. +[DeepSparse](https://github.com/neuralmagic/deepsparse/) allows you to set different text generation parameters. ### Temperature -The temperature parameter is used to MODIFY the logits. The logits are passed to the softmax function to turn them into probabilities, making it easier to pick the next word. +The temperature parameter is used to modify the logits. The logits are passed to the softmax function to turn them into probabilities, making it easier to pick the next word. When the temperature is 1, the operation is the same as that of a normal softmax operation. From 23c589234bced547bc24eb29f2d18cf0a812393e Mon Sep 17 00:00:00 2001 From: Derrick Mwiti Date: Fri, 3 Nov 2023 13:03:57 +0300 Subject: [PATCH 4/8] update readme --- notebooks/generate-text/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/generate-text/readme.md b/notebooks/generate-text/readme.md index c8b2750..7cd6467 100644 --- a/notebooks/generate-text/readme.md +++ b/notebooks/generate-text/readme.md @@ -28,7 +28,7 @@ print(result.generations[0].text) ``` ## Text Generation Parameters -[DeepSparse](https://github.com/neuralmagic/deepsparse/) allows you to set different text generation parameters. +[DeepSparse](https://github.com/neuralmagic/deepsparse/) supports different [text generation parameters](https://github.com/neuralmagic/deepsparse/blob/main/src/deepsparse/transformers/text_generation.md), including: ### Temperature From 93c493fd0cd47144b97d92dce59f891f05a089da Mon Sep 17 00:00:00 2001 From: Derrick Mwiti Date: Fri, 3 Nov 2023 13:10:41 +0300 Subject: [PATCH 5/8] update notebook --- notebooks/generate-text/generate.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/generate-text/generate.ipynb b/notebooks/generate-text/generate.ipynb index 43269f9..f56c496 100644 --- a/notebooks/generate-text/generate.ipynb +++ b/notebooks/generate-text/generate.ipynb @@ -8,7 +8,7 @@ "source": [ "# How to Generate Text on CPUs Using Different Decoding Strategies for Language Models With DeepSparse\n", "\n", - "This notebook walks through different strategies for generating text using DeepSparse on CPUs.Read the accompanying blog post on the [Neural Magic website](https://neuralmagic.com/blog/). " + "This notebook walks through different strategies for generating text using DeepSparse on CPUs. Read the accompanying blog post on the [Neural Magic website](https://neuralmagic.com/blog/). 
" ] }, { From c0a6ff91978607ef74479aa97fdf0d3ff3c5f9bb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 3 Nov 2023 21:27:06 -0600 Subject: [PATCH 6/8] Update readme.md --- notebooks/generate-text/readme.md | 51 ++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/notebooks/generate-text/readme.md b/notebooks/generate-text/readme.md index 7cd6467..c3052c2 100644 --- a/notebooks/generate-text/readme.md +++ b/notebooks/generate-text/readme.md @@ -1,23 +1,29 @@ -# How to Generate Text on CPUs With DeepSparse +# How to Control Text Generation with DeepSparse [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neuralmagic/notebooks/blob/main/notebooks/generate-text/generate.ipynb) This notebook walks through different strategies for generating text using DeepSparse on CPUs. ## Installation + +To install the necessary libraries, run the following command: + ```bash pip install deepsparse-nightly[llm] langchain sentence-transformers chromadb datasets ``` + ## Generate Text ```python from deepsparse import TextGeneration +# Define the model path and create a text generation pipeline MODEL_PATH = "hf:neuralmagic/mpt-7b-chat-pruned50-quant" - text_pipeline = TextGeneration(model_path=MODEL_PATH, sequence_length=2048) +# Configure generation parameters generation_config = {"top_k": 50, "max_new_tokens": 300} +# Generate text based on a prompt result = text_pipeline( prompt="Translate the following sentence to French `Today is a good day to go out and play football because it is sunny. After that, you can consider visiting the national park for a nature walk while seeing some wild animals.`", generation_config=generation_config, @@ -25,34 +31,45 @@ result = text_pipeline( print(result.generations[0].text) """Il est bon de sortir et jouer au football parce qu’il est jour de soleil. Après cela, il est possible de visiter le parc national pour une balade dans la nature où il est possible de rencontrer certes animaux sauvés""" - ``` -## Text Generation Parameters + +For more detailed examples and parameter explanations, please refer to the notebook. + +## Understanding Generation Parameters [DeepSparse](https://github.com/neuralmagic/deepsparse/) supports different [text generation parameters](https://github.com/neuralmagic/deepsparse/blob/main/src/deepsparse/transformers/text_generation.md), including: ### Temperature -The temperature parameter is used to modify the logits. The logits are passed to the softmax function to turn them into probabilities, making it easier to pick the next word. +Temperature is a hyperparameter that controls the randomness of predictions by scaling the logits before applying softmax. When set to 1, the model behaves normally, sampling each word according to its probability. Lower temperatures lead to less randomness and more confident outputs, while higher temperatures encourage diversity and creativity in the text generated. -When the temperature is 1, the operation is the same as that of a normal softmax operation. +Here's a more detailed explanation of its effects: +- **High Temperature (e.g., >1):** The model's outputs become more random and potentially more creative. It's like heating the decision space - more words get a chance to be chosen, even those with lower initial probabilities. +- **Low Temperature (e.g., <1):** The model's outputs become more deterministic. 
Lower temperatures effectively sharpen the distribution, making the model more conservative and more likely to repeat the most probable sequences of words. -When the temperature is high, the model becomes more random hence associated with being more creative. +### Top K -When the temperature is low the model becomes more conservative hence more confident with its responses. +The `top_k` sampling parameter restricts the model's choice to the K most likely next words. Setting `top_k` to 50, for example, means that the model only considers the top 50 words sorted by probability to continue the sequence for each step in the generation. -### Top K -In `top_k` sampling, the user sets the top number of words that the model will sample from. For example, if K is 50, the model will sample from the top 50 words. +Pros and Cons: +- **Pros:** By constraining the model's choices, `top_k` sampling often leads to more coherent and contextually appropriate text. +- **Cons:** It can exclude potentially fitting choices, especially when the set K is small, limiting creativity and variability in scenarios like storytelling or poetry generation. -The problem with this is that you have to manually choose the K, meaning some words are left out. This may not be ideal for some use cases such as creative writing. +### Top P (Nucleus Sampling) -### Top P -In `top_p` sampling, the value of K is set dynamically by setting a desired probability. +Nucleus sampling, or `top_p` sampling, dynamically determines the number of words to consider by choosing from the smallest set whose cumulative probability exceeds the threshold P. This means it looks at the top probabilities and picks enough words to cover the cumulative probability of P. -The model will choose the least number of words that exceed the chosen probability, making the number of words dynamic. +For example, if `top_p` is set to 0.8, the model will sum the probabilities from the highest down until it adds up to 0.8, and then sample only from this subset of words. -For instance, if you pick p as 0.8. The probability of words picked can be 0.5+0.2+0.1 or 0.3+0.3+0.2. +Pros and Cons: +- **Pros:** This approach allows for dynamic variability, balancing the randomness and determinism based on the actual probability distribution of the next word. +- **Cons:** It may occasionally include very improbable words if they are part of the cumulative set that reaches the desired probability threshold. ### Repetition Penalty -Repetition penalty is an important parameter that ensures the model doesn't repeat certain words or phrases. -Setting it to 1, means that there is no penalty. For example, in creative writing, you can penalize the model for repeating phrases that recently appeared in the text. +The repetition penalty parameter helps prevent the model from repeating the same words and phrases, enhancing the text's readability and originality. A penalty of 1.0 means no penalty is applied, and as the value increases, the model becomes less likely to repeat recent words. + +Application: +- **Creative Writing:** Increasing the repetition penalty can help produce more diverse and interesting text by discouraging the model from reusing the same language. +- **Informational Text:** A lower or no repetition penalty may be appropriate when the repetition of certain terms is necessary for clarity or emphasis. 
+ +By tuning these parameters, you can steer the text generation process to produce outputs that are aligned with your specific goals, whether that be creating novel content or generating precise and informative text. From 40c640c20e1118c2af194060b2cb6224c96cfcf5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 3 Nov 2023 21:27:57 -0600 Subject: [PATCH 7/8] Update readme.md --- notebooks/generate-text/readme.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/notebooks/generate-text/readme.md b/notebooks/generate-text/readme.md index c3052c2..5f6fca9 100644 --- a/notebooks/generate-text/readme.md +++ b/notebooks/generate-text/readme.md @@ -42,7 +42,6 @@ For more detailed examples and parameter explanations, please refer to the noteb Temperature is a hyperparameter that controls the randomness of predictions by scaling the logits before applying softmax. When set to 1, the model behaves normally, sampling each word according to its probability. Lower temperatures lead to less randomness and more confident outputs, while higher temperatures encourage diversity and creativity in the text generated. -Here's a more detailed explanation of its effects: - **High Temperature (e.g., >1):** The model's outputs become more random and potentially more creative. It's like heating the decision space - more words get a chance to be chosen, even those with lower initial probabilities. - **Low Temperature (e.g., <1):** The model's outputs become more deterministic. Lower temperatures effectively sharpen the distribution, making the model more conservative and more likely to repeat the most probable sequences of words. @@ -50,7 +49,6 @@ Here's a more detailed explanation of its effects: The `top_k` sampling parameter restricts the model's choice to the K most likely next words. Setting `top_k` to 50, for example, means that the model only considers the top 50 words sorted by probability to continue the sequence for each step in the generation. -Pros and Cons: - **Pros:** By constraining the model's choices, `top_k` sampling often leads to more coherent and contextually appropriate text. - **Cons:** It can exclude potentially fitting choices, especially when the set K is small, limiting creativity and variability in scenarios like storytelling or poetry generation. @@ -60,7 +58,6 @@ Nucleus sampling, or `top_p` sampling, dynamically determines the number of word For example, if `top_p` is set to 0.8, the model will sum the probabilities from the highest down until it adds up to 0.8, and then sample only from this subset of words. -Pros and Cons: - **Pros:** This approach allows for dynamic variability, balancing the randomness and determinism based on the actual probability distribution of the next word. - **Cons:** It may occasionally include very improbable words if they are part of the cumulative set that reaches the desired probability threshold. @@ -68,7 +65,7 @@ Pros and Cons: The repetition penalty parameter helps prevent the model from repeating the same words and phrases, enhancing the text's readability and originality. A penalty of 1.0 means no penalty is applied, and as the value increases, the model becomes less likely to repeat recent words. -Application: +Applications: - **Creative Writing:** Increasing the repetition penalty can help produce more diverse and interesting text by discouraging the model from reusing the same language. 
- **Informational Text:** A lower or no repetition penalty may be appropriate when the repetition of certain terms is necessary for clarity or emphasis. From 954c267e6d4c9b1de8f9f4d096b34b0f5585f4dd Mon Sep 17 00:00:00 2001 From: Derrick Mwiti Date: Mon, 6 Nov 2023 19:54:50 +0300 Subject: [PATCH 8/8] update notebook name --- ...> How-to-Control-Text-Generation-With-DeepSparse.ipynb} | 7 +++++++ notebooks/generate-text/readme.md | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) rename notebooks/generate-text/{generate.ipynb => How-to-Control-Text-Generation-With-DeepSparse.ipynb} (99%) diff --git a/notebooks/generate-text/generate.ipynb b/notebooks/generate-text/How-to-Control-Text-Generation-With-DeepSparse.ipynb similarity index 99% rename from notebooks/generate-text/generate.ipynb rename to notebooks/generate-text/How-to-Control-Text-Generation-With-DeepSparse.ipynb index f56c496..279f753 100644 --- a/notebooks/generate-text/generate.ipynb +++ b/notebooks/generate-text/How-to-Control-Text-Generation-With-DeepSparse.ipynb @@ -22,6 +22,13 @@ "pip install deepsparse-nightly[llm] langchain sentence-transformers chromadb datasets" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/generate-text/readme.md b/notebooks/generate-text/readme.md index 5f6fca9..353ee00 100644 --- a/notebooks/generate-text/readme.md +++ b/notebooks/generate-text/readme.md @@ -1,6 +1,6 @@ # How to Control Text Generation with DeepSparse -[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neuralmagic/notebooks/blob/main/notebooks/generate-text/generate.ipynb) +[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/neuralmagic/notebooks/blob/main/notebooks/generate-text/How-to-Control-Text-Generation-With-DeepSparse.ipynb) This notebook walks through different strategies for generating text using DeepSparse on CPUs.
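For readers who want to see the mechanics behind `temperature`, `top_k`, and `top_p` in one place, below is a minimal, self-contained NumPy sketch of a single decoding step over a toy distribution. It illustrates the concepts explained in the readme above; it is not DeepSparse's internal implementation, and `sample_next_token` is a hypothetical helper written only for this illustration.

```python
import numpy as np

def sample_next_token(logits, temperature=1.0, top_k=0, top_p=1.0, rng=None):
    """Toy illustration of temperature, top_k, and top_p decoding.

    Not the DeepSparse implementation -- just the mechanics described
    in the readme, applied to one sampling step.
    """
    rng = rng if rng is not None else np.random.default_rng()
    # Temperature: scale the logits before softmax. Values < 1 sharpen the
    # distribution (more conservative); values > 1 flatten it (more random).
    scaled = np.asarray(logits, dtype=float) / temperature
    probs = np.exp(scaled - scaled.max())
    probs /= probs.sum()
    # Sort token ids from most to least probable.
    order = np.argsort(probs)[::-1]
    sorted_probs = probs[order]
    keep = np.ones(len(sorted_probs), dtype=bool)
    if top_k > 0:
        # top_k: keep only the K most probable tokens.
        keep[top_k:] = False
    if top_p < 1.0:
        # top_p: keep the smallest prefix of tokens whose cumulative
        # probability reaches p (the token that crosses p is included).
        cumulative = np.cumsum(sorted_probs)
        keep &= np.concatenate(([True], cumulative[:-1] < top_p))
    # Renormalize over the surviving tokens and sample one.
    trimmed = np.where(keep, sorted_probs, 0.0)
    trimmed /= trimmed.sum()
    return int(order[rng.choice(len(trimmed), p=trimmed)])

# Toy vocabulary of five tokens.
logits = [2.0, 1.0, 0.5, 0.1, -1.0]
print(sample_next_token(logits, temperature=0.7, top_k=3, top_p=0.9))
```

Note how the `top_p` filter keeps the token that crosses the threshold, matching the "smallest set whose cumulative probability exceeds the threshold P" definition in the readme. A repetition penalty is omitted here; it would additionally down-weight the logits of tokens that already appeared earlier in the generated text before this sampling step runs.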