From 58b3d49ec225f075e2895cd4e8251c38a06ce57a Mon Sep 17 00:00:00 2001 From: turboderp Date: Mon, 19 Jun 2023 00:26:29 +0200 Subject: [PATCH 01/32] Remove obsolete test code --- cuda_test/compile.sh | 7 - cuda_test/test.cu | 339 ------------------------------------------- 2 files changed, 346 deletions(-) delete mode 100755 cuda_test/compile.sh delete mode 100644 cuda_test/test.cu diff --git a/cuda_test/compile.sh b/cuda_test/compile.sh deleted file mode 100755 index 3f04d83b..00000000 --- a/cuda_test/compile.sh +++ /dev/null @@ -1,7 +0,0 @@ -/opt/cuda/bin/nvcc -isystem /opt/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 --compiler-options '-fPIC' -std=c++17 \ -test.cu \ -../exllama_ext/q4v2_mlp.cu \ -../exllama_ext/rms_norm.cu \ -../exllama_ext/q4v2_matmul.cu \ -../exllama_ext/column_remap.cu \ --o ./test diff --git a/cuda_test/test.cu b/cuda_test/test.cu deleted file mode 100644 index 47a2e69b..00000000 --- a/cuda_test/test.cu +++ /dev/null @@ -1,339 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../exllama_ext/util.h" -#include "../exllama_ext/matrix.h" -#include "../exllama_ext/q4v2_mlp.h" - -using namespace std; - -template -class Tensor -{ -public: - T* data_cuda; - T* data_cpu; - uint32_t height; - uint32_t width; - - // From file - - Tensor(const char* filename) - { - FILE* file = fopen(filename, "rb"); - if (!file) - { - cout << "File not found: " << filename << "\n"; - return; - } - - fseek(file, 0, SEEK_END); - long size = ftell(file); - fseek(file, 0, SEEK_SET); - size_t num_elements = size / sizeof(T); - - if (num_elements == 0) - { - data_cuda = NULL; - data_cpu = NULL; - height = 0; - width = 0; - - cout << " ** " << filename << " (None)\n"; - return; - } - - data_cpu = new T[num_elements]; - fread(data_cpu, sizeof(T), num_elements, file); - fclose(file); - - char filenameshape[1024]; - strcpy(filenameshape, filename); - strcat(filenameshape, ".shape"); - - file = fopen(filenameshape, "rb"); - if (!file) - { - cout << "File not found: " << filenameshape << "\n"; - return; - } - - fread(&height, 1, sizeof(uint32_t), file); - fread(&width, 1, sizeof(uint32_t), file); - fclose(file); - - if (width * height != num_elements) - { - cout << "Incorrect shape: " << filenameshape << "\n"; - return; - } - - cudaMalloc(&data_cuda, size); - dataToCUDA(); - - cout << " ** " << filename << " (" << height << ", " << width << ")\n"; - } - - // Empty tensor - - Tensor(int _height, int _width) - { - height = _height; - width = _width; - - size_t size = (height * width) * sizeof(T); - cudaMalloc(&data_cuda, size); - - data_cpu = new T[height * width]; - } - - // Zero tensor - - Tensor(int _height, int _width, T zero_value) - { - height = _height; - width = _width; - - size_t size = (height * width) * sizeof(T); - cudaMalloc(&data_cuda, size); - - data_cpu = new T[height * width]; - - for (int i = 0; i < _width * _height; i++) data_cpu[i] = zero_value; - dataToCUDA(); - } - - // Fill - - void fill(T value) - { - for (int i = 0; i < width * height; i++) data_cpu[i] = value; - dataToCUDA(); - } - - // Copy data - - void dataToCUDA() - { - size_t size = (height * width) * sizeof(T); - cudaMemcpy(data_cuda, data_cpu, size, cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); - } 
- - void dataToCPU() - { - size_t size = (height * width) * sizeof(T); - cudaMemcpy(data_cpu, data_cuda, size, cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - } - -}; - -__global__ void dummyKernel() -{ - // Dummy kernel -} - -void warmUpCUDA() -{ - // Create a CUDA context - cudaFree(0); - - // Launch a dummy kernel - dummyKernel<<<1, 1>>>(); - cudaDeviceSynchronize(); -} - - -template -float compareTensors(Tensor& a, Tensor& b, int height = 0, int width = 0) -{ - if (height == 0 && (a.width != b.width || a.height != b.height)) - { - cout << "Incompatible sizes.\n"; - return std::numeric_limits::infinity(); - } - - if (height == 0) height = a.height; - if (width == 0) width = a.width; - - a.dataToCPU(); - b.dataToCPU(); - - float m = 0.0f; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - float a_f = __half2float(a.data_cpu[r * a.width + c]); - float b_f = __half2float(b.data_cpu[r * b.width + c]); - m = fmax(m, fabs(a_f - b_f)); - } - } - - return m; -} - -void printTensor(Tensor& a) -{ - int width = 8; if (width > a.width) width = a.width; - int height = 8; if (height > a.height) height = a.height; - - a.dataToCPU(); - - for (int c = 0; c < width; c++) cout << "---------"; - cout << "\n"; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - float a_f = __half2float(a.data_cpu[r * a.width + c]); - cout << setfill(' ') << setprecision(5) << setw(9) << a_f << dec; - } - cout << "\n"; - } -} - -void printTensor(Tensor& a) -{ - int width = 8; if (width > a.width) width = a.width; - int height = 8; if (height > a.height) height = a.height; - - a.dataToCPU(); - - for (int c = 0; c < width; c++) cout << "---------"; - cout << "\n"; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - uint32_t a_i = a.data_cpu[r * a.width + c]; - cout << " " << setfill('0') << setw(8) << hex << a_i << dec << setfill(' '); - } - cout << "\n"; - } -} - - -int main() -{ - warmUpCUDA(); - int iters; - - cout << fixed << setprecision(6); - cout << "Loading tensors...\n"; - - // Test MLP - - Tensor x ("mlp/test_mlp_x"); - Tensor x_gated ("mlp/test_mlp_x_gated"); - Tensor x_done ("mlp/test_mlp_x_done"); - Tensor x_prenorm ("mlp/test_mlp_x_prenorm"); - Tensor x_postresidual ("mlp/test_mlp_x_postresidual"); - - Tensor rms_norm_weight ("mlp/test_mlp_norm_weight"); - - Tensor up_proj_bias ("mlp/up_proj.bias"); - Tensor up_proj_qweight ("mlp/up_proj.qweight"); - Tensor up_proj_qzeros ("mlp/up_proj.qzeros"); - Tensor up_proj_scales ("mlp/up_proj.scales"); - Tensor up_proj_seq_g_idx ("mlp/up_proj.seq_g_idx"); - Tensor up_proj_x_map ("mlp/up_proj.x_map"); - - Tensor gate_proj_bias ("mlp/gate_proj.bias"); - Tensor gate_proj_qweight ("mlp/gate_proj.qweight"); - Tensor gate_proj_qzeros ("mlp/gate_proj.qzeros"); - Tensor gate_proj_scales ("mlp/gate_proj.scales"); - Tensor gate_proj_seq_g_idx ("mlp/gate_proj.seq_g_idx"); - Tensor gate_proj_x_map ("mlp/gate_proj.x_map"); - - Tensor down_proj_bias ("mlp/down_proj.bias"); - Tensor down_proj_qweight ("mlp/down_proj.qweight"); - Tensor down_proj_qzeros ("mlp/down_proj.qzeros"); - Tensor down_proj_scales ("mlp/down_proj.scales"); - Tensor down_proj_seq_g_idx ("mlp/down_proj.seq_g_idx"); - Tensor down_proj_x_map ("mlp/down_proj.x_map"); - - Tensor x_temp(x.height, x.width); - Tensor x_col_temp(1, x.height); - Tensor x_act_temp(x.height, gate_proj_qweight.width); - - Tensor out(x_gated.height, x_gated.width); - - int groupsize = gate_proj_qweight.height * 8 / gate_proj_qzeros.height; - - iters = 1; 
- auto start_time = chrono::high_resolution_clock::now(); - - cout << "--------\n"; - - cout << "Fused MLP (" << iters << " iterations)... "; - - for (int i = 0; i < iters; i++) - { - q4v2_mlp_cuda - ( - x_prenorm.data_cuda, // input - - x_temp.data_cuda, // input, normalized (empty) - x_col_temp.data_cuda, // temp for norm (empty) - x_act_temp.data_cuda, // temp for act(x @ gate) * x @ up (empty) - - rms_norm_weight.data_cuda, - (1e-06), - - gate_proj_qweight.data_cuda, - gate_proj_scales.data_cuda, - gate_proj_qzeros.data_cuda, - gate_proj_seq_g_idx.data_cuda, - gate_proj_x_map.data_cuda, - - up_proj_qweight.data_cuda, - up_proj_scales.data_cuda, - up_proj_qzeros.data_cuda, - up_proj_seq_g_idx.data_cuda, - up_proj_x_map.data_cuda, - - down_proj_qweight.data_cuda, - down_proj_scales.data_cuda, - down_proj_qzeros.data_cuda, - down_proj_seq_g_idx.data_cuda, - down_proj_x_map.data_cuda, - - x.height, - x.width, - gate_proj_qweight.width, - groupsize - ); - } - - cudaDeviceSynchronize(); - - auto end_time = chrono::high_resolution_clock::now(); - auto duration = chrono::duration_cast(end_time - start_time).count(); - duration /= iters; - cout << duration << " us / iteration\n"; - - cout << "Validating fused MLP... "; - - float diff = compareTensors(x_prenorm, x_postresidual); - - cout << "max diff.: " << diff <<"\n"; - - printTensor(x_prenorm); - printTensor(x_postresidual); - - printf("Done\n"); - return 0; -} From 5565d6dc0db27334d47ea9c3a1b3a50f9e31b8e6 Mon Sep 17 00:00:00 2001 From: turboderp Date: Mon, 19 Jun 2023 22:40:19 +0200 Subject: [PATCH 02/32] Add function to free unmanaged resources --- exllama_ext/cuda_buffers.cu | 33 ++++++++++++++---------------- exllama_ext/cuda_buffers.cuh | 2 ++ exllama_ext/cuda_func/q4_matrix.cu | 1 + exllama_ext/exllama_ext.cpp | 11 ++++++++++ model.py | 10 +++++++++ sh/test_chatbot.sh | 18 ---------------- 6 files changed, 39 insertions(+), 36 deletions(-) delete mode 100755 sh/test_chatbot.sh diff --git a/exllama_ext/cuda_buffers.cu b/exllama_ext/cuda_buffers.cu index 7e7bd75d..73146958 100644 --- a/exllama_ext/cuda_buffers.cu +++ b/exllama_ext/cuda_buffers.cu @@ -35,6 +35,12 @@ CudaBuffers::CudaBuffers CudaBuffers::~CudaBuffers() { + cudaStreamDestroy(alt_stream_1); + cudaStreamDestroy(alt_stream_2); + cudaStreamDestroy(alt_stream_3); + cudaEventDestroy(alt_stream_1_done); + cudaEventDestroy(alt_stream_2_done); + cudaEventDestroy(alt_stream_3_done); } float* CudaBuffers::get_zeros_float(const int num_zeros) @@ -76,23 +82,14 @@ void prepare_buffers_cuda ); g_buffers[_device] = buffers; +} -// if (!q4_table_init) -// { -// for (uint v_zero = 0; v_zero < 16; v_zero++) -// { -// for (uint v_read = 0; v_read < 256; v_read++) -// { -// half v_0 = __float2half((float)((int)((v_read ) & 0x0f) - v_zero - 1)); -// half v_1 = __float2half((float)((int)((v_read >> 4) & 0x0f) - v_zero - 1)); -// half2 v_01 = {v_0, v_1}; -// q4_table_host[v_zero][v_read] = v_01; -// } -// } -// q4_table_init = true; -// } -// -// cudaSetDevice(_device); -// cudaMemcpyToSymbol(q4_table, q4_table_host, 16 * 256 * sizeof(half2)); -// cudaDeviceSynchronize(); +void cleanup_buffers_cuda() +{ + for (int i = 0; i < CUDA_MAX_DEVICES; i++) + { + if (!g_buffers[i]) continue; + delete g_buffers[i]; + g_buffers[i] = NULL; + } } diff --git a/exllama_ext/cuda_buffers.cuh b/exllama_ext/cuda_buffers.cuh index 86e8af82..8b3d451d 100644 --- a/exllama_ext/cuda_buffers.cuh +++ b/exllama_ext/cuda_buffers.cuh @@ -58,4 +58,6 @@ void prepare_buffers_cuda int _max_zeros_float ); +void 
cleanup_buffers_cuda(); + #endif \ No newline at end of file diff --git a/exllama_ext/cuda_func/q4_matrix.cu b/exllama_ext/cuda_func/q4_matrix.cu index e856a978..d02fa0ec 100644 --- a/exllama_ext/cuda_func/q4_matrix.cu +++ b/exllama_ext/cuda_func/q4_matrix.cu @@ -20,6 +20,7 @@ void g_q4_keep_matrix(Q4Matrix* m) void g_q4_free_matrices() { for (const auto& m : g_q4_matrices) delete m; + g_q4_matrices.clear(); } Q4Matrix::Q4Matrix diff --git a/exllama_ext/exllama_ext.cpp b/exllama_ext/exllama_ext.cpp index 280cf79c..5a7f3419 100644 --- a/exllama_ext/exllama_ext.cpp +++ b/exllama_ext/exllama_ext.cpp @@ -110,6 +110,16 @@ void set_tuning_params tuningParams.concurrent_streams = concurrent_streams; } + +// Release all unmanaged objects allocated by the extension + +void cleanup() +{ + cleanup_buffers_cuda(); + g_q4_free_matrices(); +} + + // Prepare buffers for forward pass void prepare_buffers @@ -688,6 +698,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("set_tuning_params", &set_tuning_params, "set_tuning_params"); m.def("prepare_buffers", &prepare_buffers, "prepare_buffers"); + m.def("cleanup", &cleanup, "cleanup"); m.def("make_q4", &make_q4, "make_q4"); m.def("q4_matmul", &q4_matmul, "q4_matmul"); m.def("q4_matmul_lora", &q4_matmul_lora, "q4_matmul_lora"); diff --git a/model.py b/model.py index 548af453..06766ca2 100644 --- a/model.py +++ b/model.py @@ -863,3 +863,13 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False logits = logits.float() logits = _move_tensor(logits, self.config.device_map.embed_tokens, "logits", self.config) return logits + + + # Free unmanaged resources allocated by the C++ extension. Call this before dereferencing the ExLlama object, + # e.g. if you intend to create a new instance to load another model, but don't call it in a destructor that wraps + # the object, since it relies on CUDA function calls and the CUDA context is one of the first things to go when + # a PyTorch application terminates, before other managed objects are destroyed. + + def free_unmanaged(self): + + cuda_ext.exllama_ext.cleanup() \ No newline at end of file diff --git a/sh/test_chatbot.sh b/sh/test_chatbot.sh deleted file mode 100755 index 2595c58d..00000000 --- a/sh/test_chatbot.sh +++ /dev/null @@ -1,18 +0,0 @@ - -python test_chatbot.py -d /mnt/str/models/wizardlm-30b-uncensored-4bit-act-order/ -un "Maxine" -p prompt_assistant.txt -nnl -temp 1.0 -topp .75 - -#python test_chatbot.py \ -#-t /mnt/str/models/bluemoon-4k-13b-4bit-128g/tokenizer.model \ -#-c /mnt/str/models/bluemoon-4k-13b-4bit-128g/config.json \ -#-m /mnt/str/models/bluemoon-4k-13b-4bit-128g/bluemoonrp-13b-4k-epoch6-4bit-128g.safetensors \ -#-p prompt_bluemoon.txt \ -#-un "Player" \ -#-bn "DM" \ -#-bf \ -#-topk 30 \ -#-topp 0.45 \ -#-minp 0.1 \ -#-temp 1.4 \ -#-repp 1.3 \ -#-repps 256 \ -#-l 4096 \ No newline at end of file From 7673cced05788acc7a46f5400a868bd6dec23351 Mon Sep 17 00:00:00 2001 From: turboderp Date: Mon, 19 Jun 2023 23:29:12 +0200 Subject: [PATCH 03/32] Add waitress to web UI server --- README.md | 29 ++++++++++------------------- webui/app.py | 3 ++- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d73f5b65..8289bd23 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,11 @@ incompatibilities with older cards. 
* `safetensors` 0.3.1 * `sentencepiece` * `ninja` -* `flask` (only for the web UI) + +Additionally, only for the web UI: + +* `flask` +* `waitress` ## Linux/WSL prerequisites @@ -30,7 +34,7 @@ To run on Windows (without WSL): Studio 2022` IDE, or alternatively just the `Build Tools for Visual Studio 2022` package (make sure `Desktop development with C++` is ticked in the installer), it doesn't really matter which. 2. Install the appropriate version of [PyTorch](https://pytorch.org/get-started/locally/), choosing one of the CUDA -versions. I am developing on the nightly build, but the stable version should also work. +versions. I am developing on the nightly build, but the stable version (2.0.1) should also work. 3. Install CUDA Toolkit, ([11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and [11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) both seem to work, just make sure to match PyTorch's Compute Platform version). @@ -65,11 +69,12 @@ multibot mode: To run it: - pip install flask + pip install flask waitress python webui/app.py -d -Note that sessions are stored in `~/exllama_sessions/`. +Note that sessions are stored in `~/exllama_sessions/`. You can change the location of the sessions storage with `-sd` +if you want. ## Docker For security benefits and easier deployment, it is also possible to run the web UI in an isolated docker container. Note: the docker image currently only supports NVIDIA GPUs. @@ -179,20 +184,6 @@ confirmed to be working right now. ## Recent updates -**2023-05-24**: Added fused rotary embeddings and some minor optimizations. 13% faster on 7B, 9% on 13B. Small -improvement on larger models. Added best-case scores to benchmark results and some clarification. For easier -comparisons to other implementations, or whatever. - -**2023-05-27**: Better memory management in CUDA. Introduced auto switch between Torch's SDP backend and regular -matmul attention with some tweaks. Finished CUDA MLP. All in all about 10% faster with these updates. - -**2023-05-29**: Web UI is _almost_ up and running. Having to learn JavaScript, and it turns out I hate JavaScript. But -ChatGPT is an incredible resource for learning new languages, I gotta say, so it's not as painful as it could have -been. Anyway, in the process of working with the UI I discovered I've been measuring prompt speed incorrectly. Either -Torch or CUDA or the GPU driver does some sort of caching or self-calibration or lazy initialization during the first -pass through the model, so subsequent passes are actually _way_ faster than what I've been recording. Doesn't do much -for individual tokens, but benchmarks updated anyway. Closing in on 10k tokens/second for 7B. (!) - **2023-06-02**: Web UI is now in a fairly working state. Expect it to be a little scuffed in places. There will be a rewrite at some point to make the client-side code less seizure-inducing. It has multibot mode, chat rewind and editing features, sessions, and more. I'm going to build it out with support for instruct prompting and such, in time. @@ -216,5 +207,5 @@ disabled by default. YMMV. Use `-cs` to try it out. **2023-06-17**: Fixed a nasty bug in the fused attention that was causing slightly incorrect cache states on 13B and 33B models. You definitely want to update. -**2023-06-18**: LoRA support now. Still needs a lot of testing and som optimization, and currently you can't stack +**2023-06-18**: LoRA support now. 
Still needs a lot of testing and some optimization, and currently you can't stack multiple LoRAs during the same inference. There's also no support in the web UI yet. \ No newline at end of file diff --git a/webui/app.py b/webui/app.py index f921cf33..dec2f09d 100644 --- a/webui/app.py +++ b/webui/app.py @@ -11,6 +11,7 @@ import argparse from tokenizer import ExLlamaTokenizer from model import ExLlama, ExLlamaConfig +from waitress import serve app = Flask(__name__) app.static_folder = 'static' @@ -155,4 +156,4 @@ def api_userinput(): if host == "localhost": Timer(1, lambda: webbrowser.open(f'http://{machine}/')).start() -app.run(host = host, port = port) \ No newline at end of file +serve(app, host = host, port = port) \ No newline at end of file From 70c1ba3ce3c702adc703820f02ee86b650238e9d Mon Sep 17 00:00:00 2001 From: turboderp Date: Mon, 19 Jun 2023 23:45:18 +0200 Subject: [PATCH 04/32] Add endpoint allowing client to append arbitrary block to session --- webui/app.py | 5 +++++ webui/session.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/webui/app.py b/webui/app.py index dec2f09d..704e52f9 100644 --- a/webui/app.py +++ b/webui/app.py @@ -118,6 +118,11 @@ def api_userinput(): result = Response(stream_with_context(session.respond_multi(user_input)), mimetype = 'application/json') return result +@app.route("/api/append_block", methods=['POST']) +def api_append_block(): + data = request.get_json() + session.api_append_block(data) + return json.dumps({"result": "ok"}) + "\n" # Load the model diff --git a/webui/session.py b/webui/session.py index 4b07993f..1ecf49c3 100644 --- a/webui/session.py +++ b/webui/session.py @@ -336,6 +336,22 @@ def api_edit_block(self, data): return + def api_append_block(self, data): + + author = None + if "author" in data: + author = data["author"] + else: + if len(self.participants) > 0: + author = self.participants[0] + + text = data["text"].strip() + + newNode = Node(text, author) + self.history.append(newNode) + self.save() + + def api_set_participants(self, data): self.participants = data["participants"] From 2af7a627c663cf22fa36fea19ba4b8b6a0f302a9 Mon Sep 17 00:00:00 2001 From: turboderp Date: Tue, 20 Jun 2023 18:09:25 +0200 Subject: [PATCH 05/32] Add waitress requirement to Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b7dd15ce..393b0f6b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ WORKDIR /app RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt \ - && pip install flask==2.3.2 + && pip install flask==2.3.2 waitress USER root From 41ad603132eb8532033aff4a0e4298f0e5a01d1a Mon Sep 17 00:00:00 2001 From: Allen Benz Date: Tue, 20 Jun 2023 13:01:12 -0700 Subject: [PATCH 06/32] Suppress unnecessary where output entirely. (#84) Put helpful but verbose injected compiler path message behind verbose flag Write the failed to find cl to stderr as compilation would fail. 
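For reference, a minimal standalone sketch of the quiet compiler lookup described in the message above (the helper name, the `fallback_cl_dir` parameter and the `VERBOSE` flag are illustrative, not part of the patch; it assumes a Windows host where `where` is available — the actual change to `cuda_ext.py` follows in the diff below):

```python
import os
import subprocess
import sys

VERBOSE = False  # stands in for the extension loader's verbosity switch

def ensure_msvc_on_path(fallback_cl_dir=None):
    """Return True if cl.exe is reachable, injecting fallback_cl_dir into PATH if needed."""
    try:
        # "where /Q" prints nothing and reports success/failure via its exit code only.
        subprocess.check_output(["where", "/Q", "cl"])
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass
    if fallback_cl_dir:
        if VERBOSE:
            print("Injected compiler path:", fallback_cl_dir)
        os.environ["path"] += ";" + fallback_cl_dir
        return True
    # The warning goes to stderr, since a missing compiler means the build will fail anyway.
    print("Unable to find cl.exe; compilation will probably fail.", file=sys.stderr)
    return False
```

Using `/Q` keeps `where`'s output off stdout entirely, so the only user-visible messages are the optional verbose path injection and the stderr warning.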
--- cuda_ext.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cuda_ext.py b/cuda_ext.py index 91b4397b..251ba46e 100644 --- a/cuda_ext.py +++ b/cuda_ext.py @@ -30,14 +30,15 @@ def find_msvc(): import subprocess try: - subprocess.check_output(["where", "cl"]) + subprocess.check_output(["where", "/Q", "cl"]) except subprocess.CalledProcessError as e: cl_path = find_msvc() if cl_path: - print("Injected compiler path:", cl_path) + if verbose: + print("Injected compiler path:", cl_path) os.environ["path"] += ";" + cl_path else: - print("Unable to find cl.exe; compilation will probably fail.") + print("Unable to find cl.exe; compilation will probably fail.", file=sys.stderr) exllama_ext = load( name = extension_name, From f60732405f4714d0b4c90c6c4031ae13c012e876 Mon Sep 17 00:00:00 2001 From: Allen Benz Date: Tue, 20 Jun 2023 13:05:45 -0700 Subject: [PATCH 07/32] elapsed can be 0 somehow, sometimes. (#86) --- webui/session.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/webui/session.py b/webui/session.py index 1ecf49c3..a4a80f1a 100644 --- a/webui/session.py +++ b/webui/session.py @@ -615,7 +615,8 @@ def respond_multi(self, user_input): end_time = time.time() elapsed = end_time - begin_time new_tokens = context.shape[-1] - reused - print(f"Prompt processed in {elapsed:.2f} seconds, {new_tokens} new tokens, {(new_tokens / elapsed):.2f} tokens/second:") + token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) + print(f"Prompt processed in {elapsed:.2f} seconds, {new_tokens} new tokens, {token_rate:.2f} tokens/second:") begin_time = time.time() total_tokens = [0] @@ -692,8 +693,9 @@ def respond_multi(self, user_input): end_time = time.time() elapsed = end_time - begin_time + token_rate = 0 if elapsed == 0 else (total_tokens[0] / elapsed) - print(f"Response generated in {elapsed:.2} seconds, {total_tokens[0]} tokens, {(total_tokens[0] / elapsed):.2f} tokens/second:") + print(f"Response generated in {elapsed:.2} seconds, {total_tokens[0]} tokens, {token_rate:.2f} tokens/second:") self.save() From 9192c3fdb861740144500dc2758740785df2d8de Mon Sep 17 00:00:00 2001 From: nikuya3 <54780682+nikuya3@users.noreply.github.com> Date: Tue, 20 Jun 2023 20:37:45 +0000 Subject: [PATCH 08/32] Add option to run docker container as root user (#83) * Add option to run docker container as root user * Separate web requirements --- .env | 5 +++-- Dockerfile | 23 ++++++++++++++++------- README.md | 13 ++++++++++--- docker-compose.yml | 9 +++++---- entrypoint.sh | 18 +++++++++++------- requirements-web.txt | 2 ++ 6 files changed, 47 insertions(+), 23 deletions(-) create mode 100644 requirements-web.txt diff --git a/.env b/.env index 0b6f65f3..3ab8b5a3 100644 --- a/.env +++ b/.env @@ -1,4 +1,5 @@ PORT=5000 +RUN_UID=1000 # set to 0 to run the service as root inside the container +APPLICATION_STATE_PATH=/data # path to the directory holding application state inside the container MODEL_PATH=models/LLaMA-7B-4bit-128g # replace with the actual model path on the host -CONTAINER_MODEL_PATH=/app/model -SESSIONS_PATH=./exllama_sessions +SESSIONS_PATH=~/exllama_sessions # replace with the actual directory on the host where chat sessions should be stored diff --git a/Dockerfile b/Dockerfile index 393b0f6b..829e9dad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,31 @@ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as build - -ENV RUN_UID=1000 +ARG RUN_UID="1000" \ + APPLICATION_STATE_PATH="/data" +ENV RUN_UID=$RUN_UID \ + APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH \ 
+ CONTAINER_MODEL_PATH=$APPLICATION_STATE_PATH/model \ + CONTAINER_SESSIONS_PATH=$APPLICATION_STATE_PATH/exllama_sessions RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y ninja-build python3 python3-pip && \ rm -rf /var/lib/apt/lists/* -# Setup user which will run the service -RUN useradd -m -u $RUN_UID user -USER user +# Setup user which will run the service and create application state directory +RUN if [ ${RUN_UID} -ne 0 ] ; then useradd -m -u $RUN_UID user ; fi \ + && mkdir -p $APPLICATION_STATE_PATH \ + && mkdir -p $CONTAINER_MODEL_PATH \ + && mkdir -p $CONTAINER_SESSIONS_PATH \ + && chown -R $RUN_UID $APPLICATION_STATE_PATH +USER $RUN_UID -COPY --chown=user . /app +COPY --chown=$RUN_UID . /app WORKDIR /app +# Create application state directory and install python packages RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt \ - && pip install flask==2.3.2 waitress + && pip install -r requirements-web.txt USER root diff --git a/README.md b/README.md index 8289bd23..391da5d6 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ incompatibilities with older cards. ## Dependencies +* Python 3.9 or newer * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118 * `safetensors` 0.3.1 * `sentencepiece` @@ -96,7 +97,13 @@ docker compose build It is also possible to manually build the image: ``` -docker build -t exllama-web +docker build -t exllama-web . +``` + +NOTE: by default, the service inside the docker container is run by a non-root user. Hence, the ownership of bind-mounted directories (`/data/model` and `/data/exllama_sessions` in the default `docker-compose.yml` file) is changed to this non-root user in the container entrypoint (`entrypoint.sh`). To disable this, set `RUN_UID=0` in the `.env` file if using `docker compose`, or the following command if you manually build the image: + +``` +docker build -t exllama-web --build-arg RUN_UID=0 . ``` ### Run @@ -114,7 +121,7 @@ The configuration can be viewed in `docker-compose.yml` and changed by creating Run manually: ``` -docker run --gpus all -p 5000:5000 -v :/app/model/ --rm -it exllama-web --host 0.0.0.0:5000 +docker run --gpus all -p 5000:5000 -v :/data/model/ -v :/data/exllama_sessions --rm -it exllama-web --host 0.0.0.0:5000 ``` @@ -208,4 +215,4 @@ disabled by default. YMMV. Use `-cs` to try it out. 33B models. You definitely want to update. **2023-06-18**: LoRA support now. Still needs a lot of testing and some optimization, and currently you can't stack -multiple LoRAs during the same inference. There's also no support in the web UI yet. \ No newline at end of file +multiple LoRAs during the same inference. There's also no support in the web UI yet. diff --git a/docker-compose.yml b/docker-compose.yml index 5e47c1c2..369b6864 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,15 +4,16 @@ services: web: build: context: . 
+ args: + - RUN_UID=$RUN_UID + - APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH command: | --host 0.0.0.0:$PORT env_file: - .env - environment: - - CONTAINER_MODEL_PATH=$CONTAINER_MODEL_PATH volumes: - - $MODEL_PATH:$CONTAINER_MODEL_PATH - - $SESSIONS_PATH:/home/user/exllama_sessions + - $MODEL_PATH:$APPLICATION_STATE_PATH/model + - $SESSIONS_PATH:$APPLICATION_STATE_PATH/exllama_sessions ports: - "$PORT:$PORT" tmpfs: diff --git a/entrypoint.sh b/entrypoint.sh index 79864aa8..c03bef7c 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,15 +1,19 @@ #!/usr/bin/env bash set -Eeuo pipefail -# Ensure that the model path is set -if [ -z $CONTAINER_MODEL_PATH ]; then - echo "Must specify model path" +# Ensure that the application state path is set +if [ -z $APPLICATION_STATE_PATH ]; then + echo "Must specify application state path" exit 1 fi -# Ensure that bind-mounted directories are owned by the user that runs the service -chown -R $RUN_UID:$RUN_UID $CONTAINER_MODEL_PATH -chown -R $RUN_UID:$RUN_UID /home/user/exllama_sessions +# Ensure that bind-mounted directories are owned by the user that runs the service if the user is not root +if [ $RUN_UID -ne 0 ]; then + chown -R $RUN_UID:$RUN_UID $APPLICATION_STATE_PATH +fi # Run service as specified (non-root) user -exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py -d $CONTAINER_MODEL_PATH $@ +exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py \ + -d $CONTAINER_MODEL_PATH \ + --sessions-dir $CONTAINER_SESSIONS_PATH \ + $@ diff --git a/requirements-web.txt b/requirements-web.txt new file mode 100644 index 00000000..d1bf2563 --- /dev/null +++ b/requirements-web.txt @@ -0,0 +1,2 @@ +flask==2.3.2 +waitress==2.1.2 From d923af454b384b6c012807d4963c3cd22158971d Mon Sep 17 00:00:00 2001 From: turboderp Date: Wed, 21 Jun 2023 23:43:12 +0200 Subject: [PATCH 09/32] Update install instructions --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 391da5d6..e5b0074d 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Compute Platform version). 
Install dependencies, clone repo and run benchmark: - pip install safetensors sentencepiece ninja + pip install -r requirements.txt git clone https://github.com/turboderp/exllama cd exllama @@ -70,7 +70,7 @@ multibot mode: To run it: - pip install flask waitress + pip install -r requirements-web.txt python webui/app.py -d From b29960fe8c97796d6363182c7ea302b735b409e4 Mon Sep 17 00:00:00 2001 From: turboderp Date: Wed, 21 Jun 2023 23:43:33 +0200 Subject: [PATCH 10/32] Add Python version check and warning message --- example_flask.py | 2 +- lora.py | 2 +- model.py | 7 +++++++ webui/app.py | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/example_flask.py b/example_flask.py index 58aca5fe..d83a45b4 100644 --- a/example_flask.py +++ b/example_flask.py @@ -1,5 +1,5 @@ -from flask import Flask, request from model import ExLlama, ExLlamaCache, ExLlamaConfig +from flask import Flask, request from tokenizer import ExLlamaTokenizer from generator import ExLlamaGenerator import os, glob diff --git a/lora.py b/lora.py index cd789ca8..53106548 100644 --- a/lora.py +++ b/lora.py @@ -1,8 +1,8 @@ +from model import ExLlamaConfig, Ex4bitLinear import torch import json from safetensors.torch import load_file as safe_load_file from torch import load as load_file -from model import ExLlamaConfig, Ex4bitLinear class ExLlamaLora: diff --git a/model.py b/model.py index 06766ca2..1c536f28 100644 --- a/model.py +++ b/model.py @@ -1,3 +1,10 @@ +import sys +min_version = (3, 9) +if sys.version_info < min_version: + print("") + print(f" ## Warning: this project requires Python {min_version[0]}.{min_version[1]} or higher.") + print("") + import torch from torch import nn import torch.nn.functional as F diff --git a/webui/app.py b/webui/app.py index 704e52f9..276b26c5 100644 --- a/webui/app.py +++ b/webui/app.py @@ -1,6 +1,7 @@ import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from model import ExLlama, ExLlamaConfig from flask import Flask, render_template, request, jsonify from flask import Response, stream_with_context from threading import Timer, Lock @@ -10,7 +11,6 @@ from session import prepare_sessions, get_initial_session, Session, load_session, new_session, _sessions_dir import argparse from tokenizer import ExLlamaTokenizer -from model import ExLlama, ExLlamaConfig from waitress import serve app = Flask(__name__) From 15035aa2ac686d883cc4b5350cc7152dd2bd893b Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 12:43:46 +0200 Subject: [PATCH 11/32] Output logits to device of input IDs, allow mapping of embed_tokens --- model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/model.py b/model.py index 1c536f28..98df89b6 100644 --- a/model.py +++ b/model.py @@ -806,6 +806,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False batch_size, seq_len = input_ids.shape past_len = cache.current_seq_len + output_device = input_ids.device buffer = ExLlamaBuffer(self.config) @@ -829,7 +830,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False # Embeddings # TODO: Allow passing input embeddings instead of IDs - input_ids = _move_tensor(input_ids, "cpu", "input_ids", self.config) + input_ids = _move_tensor(input_ids, self.config.device_map.embed_tokens, "input_ids", self.config) hidden_states = self.embed_tokens(input_ids) # Split buffers to devices @@ -868,7 +869,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False # logits = 
cuda_ext.matmul_half(hidden_states, self.lm_head_data, cublas = False) logits = logits.float() - logits = _move_tensor(logits, self.config.device_map.embed_tokens, "logits", self.config) + logits = _move_tensor(logits, output_device, "logits", self.config) return logits From d953933b4518e268ac5743d101f8ee8c9ab23bc7 Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 12:50:05 +0200 Subject: [PATCH 12/32] Option to explicitly set output device --- model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model.py b/model.py index 98df89b6..28a3e79e 100644 --- a/model.py +++ b/model.py @@ -798,7 +798,7 @@ def __init__(self, config): torch.cuda.empty_cache() - def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None): + def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None, output_device = None): # if torch.is_grad_enabled(): # raise ValueError("Forward pass called with gradients enabled. Back propagation is not supported yet.") @@ -806,7 +806,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False batch_size, seq_len = input_ids.shape past_len = cache.current_seq_len - output_device = input_ids.device + if output_device is None: output_device = input_ids.device buffer = ExLlamaBuffer(self.config) From 58e9e69513e354d0e3eb3e2aac4e17020540b196 Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 16:14:27 +0200 Subject: [PATCH 13/32] Allow LoRA to ignore (empty) bias --- exllama_ext/cuda_func/q4_matmul.cu | 6 +++++- lora.py | 6 ++++++ test_benchmark_inference.py | 2 ++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/exllama_ext/cuda_func/q4_matmul.cu b/exllama_ext/cuda_func/q4_matmul.cu index 04047617..fe2343f7 100644 --- a/exllama_ext/cuda_func/q4_matmul.cu +++ b/exllama_ext/cuda_func/q4_matmul.cu @@ -245,6 +245,10 @@ void q4_matmul_recons_cuda const half alpha = __float2half(1.0f); const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f); - cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width); + +// const float alpha = 1.0f; +// const float beta = no_zero ? 
1.0f : 0.0f; +// cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width, +// x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width); } diff --git a/lora.py b/lora.py index 53106548..10e84779 100644 --- a/lora.py +++ b/lora.py @@ -13,6 +13,7 @@ class ExLlamaLora: lora_scaling: float config: ExLlamaConfig tensors: dict[torch.tensor] + bias_ignored: bool def __init__(self, model, lora_config_path, lora_path): @@ -21,6 +22,7 @@ def __init__(self, model, lora_config_path, lora_path): self.model = model self.config = model.config self.tensors = {} + self.bias_ignored = False # Grab relevant items from LoRA config @@ -56,6 +58,10 @@ def __init__(self, model, lora_config_path, lora_path): decoder_layer = ks[4] lora_half = ks[5] + if lora_half == "bias": + self.bias_ignored = True + continue + target_module = self.model.layers[decoder_idx] if decoder_part == "self_attn": target_module = target_module.self_attn elif decoder_part == "mlp": target_module = target_module.mlp diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index e3794611..f0ff059b 100644 --- a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -133,6 +133,8 @@ def mem(name, total = False): print(f" ## Error: please specify lora path to adapter_config.json") sys.exit() lora = ExLlamaLora(model, args.lora_config, args.lora) + if lora.bias_ignored: + print(f" !! Warning: LoRA bias ignored") # Test sequence From 594fd1b4dbd65c335a5650578eb438d35f77353a Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 19:23:23 +0200 Subject: [PATCH 14/32] Allow LoRA rank < 4 --- exllama_ext/cuda_func/half_matmul.cu | 34 ++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/exllama_ext/cuda_func/half_matmul.cu b/exllama_ext/cuda_func/half_matmul.cu index 6f62d9a4..76ee1e43 100644 --- a/exllama_ext/cuda_func/half_matmul.cu +++ b/exllama_ext/cuda_func/half_matmul.cu @@ -130,7 +130,7 @@ const int S_THREADS_X = 8; // width const int S_THREADS_Z = 1; // height const int S_BLOCKSIZE = MAX_DIM_SMALL / 1024 * S_THREADS_X; // dim -template +template __global__ void half_matmul_small_kernel ( const half* __restrict__ x, @@ -163,7 +163,7 @@ __global__ void half_matmul_small_kernel const half* w_ptr = w_.item_ptr(k, column); half* out_ptr = out_.item_ptr(row, column); - if constexpr (use_half2) + if constexpr (use_half2 && !odd_rank) { half2* x_ptr2 = (half2*) x_ptr; half2* x_ptr2_end = (half2*) x_ptr_end; @@ -204,13 +204,22 @@ __global__ void half_matmul_small_kernel while(x_ptr < x_ptr_end) { - #pragma unroll - for (int i = 0; i < 4; ++i) + if constexpr (odd_rank) { half x_item = *x_ptr++; half w_item = *w_ptr; w_ptr += width; r = __hfma(x_item, w_item, r); } + else + { + #pragma unroll + for (int i = 0; i < 4; ++i) + { + half x_item = *x_ptr++; + half w_item = *w_ptr; w_ptr += width; + r = __hfma(x_item, w_item, r); + } + } } __shared__ half accum[MAX_DIM_SMALL / S_BLOCKSIZE][S_THREADS_X]; @@ -242,7 +251,7 @@ void half_matmul_small_cuda { bool use_half2 = !tuningParams->matmul_no_half2; - // printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); + //printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); dim3 threads ( @@ -258,11 +267,18 @@ void half_matmul_small_cuda height ); - // printf("t... %i %i %i\n", threads.x, threads.y, threads.z); - // printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); + //printf("t... 
%i %i %i\n", threads.x, threads.y, threads.z); + //printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); //if (!no_zero) cudaMemsetAsync(out, 0, height * width * sizeof(half)); - if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); - else half_matmul_small_kernel<<>>(x, w, out, height, dim, width, no_zero); + if (dim & 0x03) + { + half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + } + else + { + if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + else half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + } } From 2747e00b11a5537c404d1f73e94e184635c305f1 Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 19:33:20 +0200 Subject: [PATCH 15/32] Refactor perplexity test, add options --- perplexity.py | 72 +++++++++++++++++++++++++----- test_benchmark_inference.py | 87 +++++++++++++------------------------ 2 files changed, 91 insertions(+), 68 deletions(-) diff --git a/perplexity.py b/perplexity.py index 593df6c1..dce65e10 100644 --- a/perplexity.py +++ b/perplexity.py @@ -47,20 +47,27 @@ def _tokenize(self, text): return self.tokenizer.encode(text) - # This loads *and* tokenizes into chunks - def load(self, dataset_path, context=2048, overlap=0, minlength = 0): + # Load raw dataset from a text file and tokenize into chunks. Each chunk can optionally truncated to allow for + # evaluating the same data at different sequence lengths + + def load(self, dataset_path, chunk_size, chunk_truncate = None, overlap = 0, minlength = 0, json_key = "text"): + file_extension = os.path.splitext(dataset_path)[1] - # JSON format + # JSON format: Returned chunks may be of variable length, with each chunk representing one list item + if file_extension == '.jsonl' or file_extension == '.json': with open(dataset_path) as f: for line in f: - example = json.loads(line)["text"] + example = json.loads(line)[json_key] if len(example) > minlength: chunk = self._tokenize(example) - chunk = chunk[:, :context + 1] + chunk = chunk[:, :chunk_size] + if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] self.dataset_chunks.append(chunk) - # Raw Text + + # Raw Text: Returned chunks are fixed length windows of the entire tokenized dataset + else: with open(dataset_path) as f: text = f.read() @@ -68,18 +75,19 @@ def load(self, dataset_path, context=2048, overlap=0, minlength = 0): tokens = self._tokenize(text) # overlap shouldn't be bigger than the context, also need at least one token for predicting last... - if overlap >= context: - overlap = context-2 + if overlap >= chunk_size: + overlap = chunk_size-2 # We can't use torch.chunks since it want's to split things into equal sized chunks. 
Instead, let's do our own chunking start = 0 while start < tokens.size(1): - chunk = tokens[:, start:start+context] - start += context - overlap + chunk = tokens[:, start:start + chunk_size] + start += chunk_size - overlap + if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] self.dataset_chunks.append(chunk) - def test(self, chunk_limit=sys.maxsize, lora = None, tag="", ppl_token = False): + def test(self, chunk_limit = sys.maxsize, lora = None, tag = "", ppl_token = False): if not self.dataset_chunks: sys.exit(" xx ERROR: Empty dataset!") @@ -126,3 +134,45 @@ def test(self, chunk_limit=sys.maxsize, lora = None, tag="", ppl_token = False): print("") print(f" ** Perplexity{tag}: {perplexity:.4f}") + + +def add_args(parser): + + parser.add_argument("-ppl", "--perplexity", nargs = '?', const = 'default', metavar = "METHOD", help = "Perplexity benchmark. Optionally specify method: gptq-for-llama, llama.cpp (not yet implemented)") + parser.add_argument("-ppl-ds", "--perplexity-dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") + parser.add_argument("-ppl-cn", "--perplexity-chunk-num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark", default = 100) + parser.add_argument("-ppl-cs", "--perplexity-chunk-size", type = int, help = "Size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl-ct", "--perplexity-chunk-truncate", type = int, help = "Truncated size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl-co", "--perplexity-chunk-overlap", type = int, help = "Chunk overlap", default = 0) + parser.add_argument("-ppl-cm", "--perplexity-chunk-min", type = int, help = "Minimum chunk length", default = 50) + parser.add_argument("-ppl-key", "--perplexity-json-key", type = str, help = "Key to extract from JSON dataset, default: 'text'", default = "text") + parser.add_argument("-ppl-t", "--perplexity-token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") + + +def post_parse(args): + + if not args.perplexity: return + + # GPTQ-for-LLaMa equivalent + + if args.perplexity == "gptq-for-llama": + args.perplexity_dataset = "datasets/wikitext2.txt" + args.perplexity_chunk_num = 128 + args.perplexity_chunk_size = 2048 + args.perplexity_chunk_truncate = 2048 + args.perplexity_chunk_overlap = 0 + args.perplexity_chunk_min = 0 + + # Default dataset for legacy method + + if args.perplexity_dataset is None: args.perplexity_dataset = "datasets/wikitext2_val_sample.jsonl" + + print(f" -- Perplexity:") + print(f" -- - Dataset: {args.perplexity_dataset}") + print(f" -- - Chunks: {args.perplexity_chunk_num}") + print(f" -- - Chunk size: {args.perplexity_chunk_size}" + (f" -> {args.perplexity_chunk_truncate}" if args.perplexity_chunk_truncate is not None else "")) + print(f" -- - Chunk overlap: {args.perplexity_chunk_overlap}") + print(f" -- - Min. 
chunk size: {args.perplexity_chunk_min}") + print(f" -- - Key: {args.perplexity_json_key}") + if args.perplexity_token: print("f -- - Per-token mode") + diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index f0ff059b..14047f53 100644 --- a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -2,6 +2,7 @@ from tokenizer import ExLlamaTokenizer from generator import ExLlamaGenerator from lora import ExLlamaLora +import perplexity from perplexity import Perplexity import time import torch @@ -86,29 +87,33 @@ def mem(name, total = False): parser = argparse.ArgumentParser(description = "Benchmark tests for ExLlama") model_init.add_args(parser) +perplexity.add_args(parser) parser.add_argument("-p", "--perf", action = "store_true", help = "Benchmark speed and VRAM usage") -parser.add_argument("-ppl", "--perplexity", nargs = '?', const = 'default', metavar = "METHOD", help = "Perplexity benchmark (slow). Optionally specify method: default, gptq-for-llama, llama.cpp (not yet implemented)") -parser.add_argument("-ppl-ds", "--perplexity-dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") -parser.add_argument("-ppl-num", "--perplexity-num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark") -parser.add_argument("-ppl-t", "--perplexity-token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") parser.add_argument("-v", "--validate", action = "store_true", help = "Quick perplexity benchmark just to test if model is working at all, and short text completion") parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") +parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. 
to use during benchmark") args = parser.parse_args() + model_init.post_parse(args) +perplexity.post_parse(args) model_init.get_model_files(args) +# Paths + +if args.lora_dir is not None: + args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") + args.lora = os.path.join(args.lora_dir, "adapter_model.bin") + # Feedback print_opts = [] if args.perf: print_opts.append("perf") -if args.perplexity: print_opts.append("perplexity") -if args.perplexity_dataset: print_opts.append("perplexity_dataset") -if args.perplexity_num: print_opts.append("perplexity-num") -if args.perplexity_token: print_opts.append("perplexity-token") if args.validate: print_opts.append("validate") +if args.perplexity: print_opts.append("perplexity") +if args.perplexity_token: print_opts.append("perplexity_token") model_init.print_options(args, print_opts) @@ -128,6 +133,7 @@ def mem(name, total = False): lora = None if args.lora: + print(f" -- LoRA config: {args.lora_config}") print(f" -- Loading LoRA: {args.lora}") if args.lora_config is None: print(f" ## Error: please specify lora path to adapter_config.json") @@ -192,48 +198,18 @@ def mem(name, total = False): ppl = Perplexity(args.perplexity, model, cache, tokenizer) - # Default (legacy) method - - testdata_path = "datasets/wikitext2_val_sample.jsonl" - testdata_context = 2048 - testdata_overlap = 0 - testdata_minlength = 50 - num_samples = 100 - - # Optionally specified dataset, either raw or .jsonl - - if args.perplexity_dataset: - testdata_path = args.perplexity_dataset - testdata_context = 2048 - testdata_overlap = 0 - testdata_minlength = 0 - - # Settings mimicking GPTQ-for-LLaMa - - if args.perplexity == "gptq-for-llama": - if not args.perplexity_dataset: - testdata_path = "datasets/wikitext2.txt" - testdata_context = 2048 - testdata_overlap = 0 - testdata_minlength = 0 - num_samples = 128 - - if args.perplexity == "default": - pass - - # Overrides - - if args.perplexity_num: - num_samples = args.perplexity_num - print(" -- Loading dataset...") - ppl.load(testdata_path, - testdata_context, - testdata_overlap, - testdata_minlength) + ppl.load(dataset_path = args.perplexity_dataset, + chunk_size = args.perplexity_chunk_size, + chunk_truncate = args.perplexity_chunk_truncate, + overlap = args.perplexity_chunk_overlap, + minlength = args.perplexity_chunk_min, + json_key = args.perplexity_json_key) - ppl.test(num_samples, + begin() + + ppl.test(args.perplexity_chunk_num, lora = lora, ppl_token = args.perplexity_token) @@ -243,20 +219,17 @@ def mem(name, total = False): ppl = Perplexity(args.perplexity, model, cache, tokenizer) - testdata_path = "datasets/wikitext2_val_sample.jsonl" - testdata_context = 2048 - testdata_overlap = 0 - testdata_minlength = 50 + ppl.load(dataset_path = "datasets/wikitext2_val_sample.jsonl", + chunk_size = 2048, + chunk_truncate = 2048, + overlap = 0, + minlength = 50, + json_key = "text") - ppl.load(testdata_path, - testdata_context, - testdata_overlap, - testdata_minlength) + # Short perplexity tests in switched and quant mode, should produce roughly equal results begin() - # Short perplexity tests in switched and quant mode, should produce roughly equal results - model.config.matmul_recons_thd = 1 ppl.test(8, lora = lora, tag = " (reconstruct)") model.config.matmul_recons_thd = 0 From 2c7bca4596d8adc9dc17c88d704cf6e5e40ed550 Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 19:48:25 +0200 Subject: [PATCH 16/32] Fix perplexity test for chunk_size > 2048 --- perplexity.py | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/perplexity.py b/perplexity.py index dce65e10..fb1ef4a9 100644 --- a/perplexity.py +++ b/perplexity.py @@ -32,15 +32,15 @@ def _begin(self): self.cache.current_seq_len = 0 - def _next_logits(self, input_ids, apply_lora, last_id_only=True): - n_logits = None + def _next_logits(self, input_ids, apply_lora, last_id_only = True): + n_logits = [] a = 0 while a < input_ids.shape[-1]: - b = min(input_ids.shape[-1], a + 2048) - n_logits = self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora) + b = min(input_ids.shape[-1], a + 2048) # TODO: Should this be a config parameter? + n_logits.append(self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora)) a = b - return n_logits + return torch.cat(n_logits, dim = 1) def _tokenize(self, text): From d05f5d01a60af98b09b243290607e5e542551ef5 Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 19:54:09 +0200 Subject: [PATCH 17/32] Add option to compress positional embeddings --- model.py | 3 +++ model_init.py | 5 +++++ webui/app.py | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/model.py b/model.py index 28a3e79e..885927cd 100644 --- a/model.py +++ b/model.py @@ -69,6 +69,7 @@ def __init__(self, model_config_path): # Optional settings self.max_seq_len = 2048 # Reduce to save memory. Can also be increased, but the pretrained models produce degenerate output after 2048 tokens in any case. Should be possible to finetune for longer sequence lengths. + self.compress_pos_emb = 1.0 # Increase to compress positional embeddings applied to sequence self.gpu_peer_fix = False # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to move tensors via system RAM instead, where needed self.auto_map = None # List of floats with memory allocation in GB, per CUDA device, overrides device_map @@ -745,6 +746,8 @@ def __init__(self, config): inv_freq = 1.0 / (self.config.rotary_embedding_base ** (torch.arange(0, self.config.head_dim, 2, device = device).float() / self.config.head_dim)) t = torch.arange(self.config.max_seq_len, device = device, dtype = torch.float32) + if self.config.compress_pos_emb != 1.0: t /= self.config.compress_pos_emb + freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim = -1) diff --git a/model_init.py b/model_init.py index 7238a461..d61dbac9 100644 --- a/model_init.py +++ b/model_init.py @@ -12,6 +12,8 @@ def add_args(parser): parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. -gs 20,7,7") parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048) + parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0) + parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs") parser.add_argument("-mmrt", "--matmul_recons_thd", type = int, help = "No. rows at which to use reconstruction and cuBLAS for quant matmul. 
0 = never, 1 = always", default = 8) @@ -74,6 +76,8 @@ def print_options(args, extra_options = None): print(f" -- Model config: {args.config}") print(f" -- Model: {args.model}") print(f" -- Sequence length: {args.length}") + if args.compress_pos_emb != 1.0: + print(f" -- RoPE compression factor: {args.compress_pos_emb}") print(f" -- Tuning:") print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else "")) @@ -98,6 +102,7 @@ def make_config(args): config.model_path = args.model config.max_seq_len = args.length + config.compress_pos_emb = args.compress_pos_emb config.set_auto_map(args.gpu_split) config.gpu_peer_fix = args.gpu_peer_fix diff --git a/webui/app.py b/webui/app.py index 276b26c5..c2fa909f 100644 --- a/webui/app.py +++ b/webui/app.py @@ -128,7 +128,7 @@ def api_append_block(): parser = argparse.ArgumentParser(description="Simple web-based chatbot for ExLlama") parser.add_argument("-host", "--host", type = str, help = "IP:PORT eg, 0.0.0.0:7862", default = "localhost:5000") -parser.add_argument("-sd", "--sessions-dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") +parser.add_argument("-sd", "--sessions_dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") model_init.add_args(parser) args = parser.parse_args() From 7f11b1e18a6b8ba961375db9781728b6026c5a29 Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 19:54:48 +0200 Subject: [PATCH 18/32] Consistent naming for command line args --- entrypoint.sh | 2 +- perplexity.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/entrypoint.sh b/entrypoint.sh index c03bef7c..74d077f7 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -15,5 +15,5 @@ fi # Run service as specified (non-root) user exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py \ -d $CONTAINER_MODEL_PATH \ - --sessions-dir $CONTAINER_SESSIONS_PATH \ + --sessions_dir $CONTAINER_SESSIONS_PATH \ $@ diff --git a/perplexity.py b/perplexity.py index fb1ef4a9..eef41e19 100644 --- a/perplexity.py +++ b/perplexity.py @@ -139,14 +139,14 @@ def test(self, chunk_limit = sys.maxsize, lora = None, tag = "", ppl_token = Fal def add_args(parser): parser.add_argument("-ppl", "--perplexity", nargs = '?', const = 'default', metavar = "METHOD", help = "Perplexity benchmark. 
Optionally specify method: gptq-for-llama, llama.cpp (not yet implemented)") - parser.add_argument("-ppl-ds", "--perplexity-dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") - parser.add_argument("-ppl-cn", "--perplexity-chunk-num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark", default = 100) - parser.add_argument("-ppl-cs", "--perplexity-chunk-size", type = int, help = "Size of chunks for perplexity benchmark", default = 2048) - parser.add_argument("-ppl-ct", "--perplexity-chunk-truncate", type = int, help = "Truncated size of chunks for perplexity benchmark", default = 2048) - parser.add_argument("-ppl-co", "--perplexity-chunk-overlap", type = int, help = "Chunk overlap", default = 0) - parser.add_argument("-ppl-cm", "--perplexity-chunk-min", type = int, help = "Minimum chunk length", default = 50) - parser.add_argument("-ppl-key", "--perplexity-json-key", type = str, help = "Key to extract from JSON dataset, default: 'text'", default = "text") - parser.add_argument("-ppl-t", "--perplexity-token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") + parser.add_argument("-ppl_ds", "--perplexity_dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") + parser.add_argument("-ppl_cn", "--perplexity_chunk_num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark", default = 100) + parser.add_argument("-ppl_cs", "--perplexity_chunk_size", type = int, help = "Size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl_ct", "--perplexity_chunk_truncate", type = int, help = "Truncated size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl_co", "--perplexity_chunk_overlap", type = int, help = "Chunk overlap", default = 0) + parser.add_argument("-ppl_cm", "--perplexity_chunk_min", type = int, help = "Minimum chunk length", default = 50) + parser.add_argument("-ppl_key", "--perplexity_json_key", type = str, help = "Key to extract from JSON dataset, default: 'text'", default = "text") + parser.add_argument("-ppl_t", "--perplexity_token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") def post_parse(args): From fc96ae82b951c77bb1fe634a5ab27563c836c655 Mon Sep 17 00:00:00 2001 From: Panchovix Date: Thu, 22 Jun 2023 14:17:45 -0400 Subject: [PATCH 19/32] Fix AttributeError: 'torch.device' object has no attribute 'startswith' (#96) --- model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model.py b/model.py index 28a3e79e..71b01ef7 100644 --- a/model.py +++ b/model.py @@ -621,7 +621,7 @@ def _move_tensor(tensor, new_device, name, config): device = str(tensor.device) if device == new_device: return tensor if config.gpu_peer_fix: - if device.startswith("cuda:") and new_device.startswith("cuda:"): + if str(device).startswith("cuda:") and str(new_device).startswith("cuda:"): tensor = tensor.to("cpu") return tensor.to(new_device) @@ -880,4 +880,4 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False def free_unmanaged(self): - cuda_ext.exllama_ext.cleanup() \ No newline at end of file + cuda_ext.exllama_ext.cleanup() From 21e5630d2cd52b2138133d0d5d9df0b817187c9a Mon Sep 17 00:00:00 2001 From: Allen Benz Date: Thu, 22 Jun 2023 11:41:28 -0700 Subject: [PATCH 20/32] Windows doesn't use 
utf-8 encoding by default. (#97) --- datasets/download_datasets.py | 2 +- perplexity.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/download_datasets.py b/datasets/download_datasets.py index 872435cd..804ff7f6 100644 --- a/datasets/download_datasets.py +++ b/datasets/download_datasets.py @@ -11,7 +11,7 @@ def download_hf(filename, dataset, subset, split, key, div): hf_dataset = load_dataset(dataset, subset, split = split) data = div.join(hf_dataset[key]) - with open(filename, "w") as f: + with open(filename, "w", encoding="utf-8") as f: f.write(data) download_hf("wikitext2.txt", "wikitext", "wikitext-2-raw-v1", "test", "text", "\n\n") diff --git a/perplexity.py b/perplexity.py index 593df6c1..d8c60d1f 100644 --- a/perplexity.py +++ b/perplexity.py @@ -62,7 +62,7 @@ def load(self, dataset_path, context=2048, overlap=0, minlength = 0): self.dataset_chunks.append(chunk) # Raw Text else: - with open(dataset_path) as f: + with open(dataset_path, encoding="utf-8") as f: text = f.read() tokens = self._tokenize(text) From 4ba65279570a264323cdf9c423782e69fec07f7d Mon Sep 17 00:00:00 2001 From: turboderp Date: Thu, 22 Jun 2023 21:07:20 +0200 Subject: [PATCH 21/32] Confirm LoRA bias is zero before ignoring --- lora.py | 3 +++ test_benchmark_inference.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lora.py b/lora.py index 10e84779..d41f9b14 100644 --- a/lora.py +++ b/lora.py @@ -59,6 +59,9 @@ def __init__(self, model, lora_config_path, lora_path): lora_half = ks[5] if lora_half == "bias": + epsilon = 1e-6 + if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon: + raise ValueError(f" ## Error: unsupported bias target {self.lora_path}: {key}") self.bias_ignored = True continue diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index 14047f53..c8ad68a4 100644 --- a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -140,7 +140,7 @@ def mem(name, total = False): sys.exit() lora = ExLlamaLora(model, args.lora_config, args.lora) if lora.bias_ignored: - print(f" !! Warning: LoRA bias ignored") + print(f" !! Warning: LoRA zero bias ignored") # Test sequence From 4ee693d6d4689131910c26f0632ec47bbc4fdf4e Mon Sep 17 00:00:00 2001 From: EyeDeck Date: Thu, 22 Jun 2023 15:47:32 -0400 Subject: [PATCH 22/32] Fix compiling in venv on Windows (#89) Torch looks for e.g. 
python310.lib in an invalid directory while in a Python venv; this checks if we're in a venv and gives the compiler the right directory --- cuda_ext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_ext.py b/cuda_ext.py index 251ba46e..29b19f55 100644 --- a/cuda_ext.py +++ b/cuda_ext.py @@ -57,7 +57,7 @@ def find_msvc(): ], extra_include_paths = [os.path.join(library_dir, "exllama_ext")], verbose = verbose, - extra_ldflags = ["cublas.lib"] if windows else [], + extra_ldflags = (["cublas.lib"] + ([f"/LIBPATH:{os.path.join(sys.base_prefix, 'libs')}"] if sys.base_prefix != sys.prefix else [])) if windows else [], extra_cuda_cflags = ["-lineinfo"] + (["-U__HIP_NO_HALF_CONVERSIONS__", "-O3"] if torch.version.hip else []), extra_cflags = ["-O3"] # extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"] From 47ee2fd676b4fdcf1e411b2f55dcb526e5cde9e0 Mon Sep 17 00:00:00 2001 From: turboderp Date: Fri, 23 Jun 2023 00:47:34 +0200 Subject: [PATCH 23/32] Rework repetition penalty --- cuda_ext.py | 7 +++++ exllama_ext/cpu_func/rep_penalty.cpp | 45 ++++++++++++++++++++++++++++ exllama_ext/cpu_func/rep_penalty.h | 12 ++++++++ exllama_ext/exllama_ext.cpp | 30 ++++++++++++++++++- generator.py | 36 ++++++++++++---------- 5 files changed, 113 insertions(+), 17 deletions(-) diff --git a/cuda_ext.py b/cuda_ext.py index 251ba46e..2e602503 100644 --- a/cuda_ext.py +++ b/cuda_ext.py @@ -74,6 +74,7 @@ def find_msvc(): from exllama_ext import rms_norm from exllama_ext import rope_ from exllama_ext import rep_penalty +from exllama_ext import apply_rep_penalty # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension @@ -158,3 +159,9 @@ def ext_rep_penalty_mask_cpu(vocab_size, sequence, penalty_max, sustain, decay): rep_mask = torch.empty(vocab_size, dtype = torch.float32) rep_penalty(sequence, rep_mask, penalty_max, sustain, decay) return rep_mask + + +def ext_apply_rep_penalty_mask_cpu(sequence, penalty_max, sustain, decay, logits): + + apply_rep_penalty(sequence, penalty_max, sustain, decay, logits) + diff --git a/exllama_ext/cpu_func/rep_penalty.cpp b/exllama_ext/cpu_func/rep_penalty.cpp index c9ffe0b7..26c9dc16 100644 --- a/exllama_ext/cpu_func/rep_penalty.cpp +++ b/exllama_ext/cpu_func/rep_penalty.cpp @@ -1,4 +1,6 @@ #include "rep_penalty.h" +#include +#include void rep_penalty_cpu ( @@ -26,4 +28,47 @@ void rep_penalty_cpu if (v > rep_mask[t]) rep_mask[t] = v; if (--s < 0) v += dv; } +} + +bool* g_rep_mask = NULL; +int g_vocab_size = 0; + +void apply_rep_penalty_cpu +( + const int vocab_size, + const uint64_t* sequence, + const float penalty_max, + const int sustain, + const int decay, + const int seq_len, + float* logits +) +{ + if (vocab_size != g_vocab_size) + { + if (g_rep_mask) free(g_rep_mask); + g_vocab_size = vocab_size; + g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool)); + } + + memset(g_rep_mask, 0, g_vocab_size * sizeof(bool)); + + float v = penalty_max; + float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; + + int s = sustain == -1 ? 
seq_len : sustain; + int beg = seq_len - sustain - decay; + if (beg < 0) beg = 0; + + for (int i = seq_len; i > beg;) + { + uint64_t t = sequence[--i]; + if (!g_rep_mask[t]) + { + if (logits[t] > 0.0) logits[t] /= v; + else logits[t] *= v; + g_rep_mask[t] = true; + } + if (--s < 0) v += dv; + } } \ No newline at end of file diff --git a/exllama_ext/cpu_func/rep_penalty.h b/exllama_ext/cpu_func/rep_penalty.h index 895ddd93..4f63b484 100644 --- a/exllama_ext/cpu_func/rep_penalty.h +++ b/exllama_ext/cpu_func/rep_penalty.h @@ -15,4 +15,16 @@ void rep_penalty_cpu const int seq_len ); +void apply_rep_penalty_cpu +( + const int vocab_size, + const uint64_t* sequence, + const float penalty_max, + const int sustain, + const int decay, + const int seq_len, + float* logits +); + + #endif diff --git a/exllama_ext/exllama_ext.cpp b/exllama_ext/exllama_ext.cpp index 5a7f3419..3cfc31d4 100644 --- a/exllama_ext/exllama_ext.cpp +++ b/exllama_ext/exllama_ext.cpp @@ -694,6 +694,33 @@ void rep_penalty ); } +void apply_rep_penalty +( + torch::Tensor sequence, + float penalty_max, + int sustain, + int decay, + torch::Tensor logits +) +{ + TORCH_CHECK_DTYPE(sequence, kLong); + TORCH_CHECK_DTYPE(logits, kFloat); + + int vocab_size = logits.size(-1); + int seq_len = sequence.size(-1); + + apply_rep_penalty_cpu + ( + vocab_size, + (uint64_t*) sequence.data_ptr(), + penalty_max, + sustain, + decay, + seq_len, + (float*) logits.data_ptr() + ); +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("set_tuning_params", &set_tuning_params, "set_tuning_params"); @@ -711,5 +738,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("half_matmul", &half_matmul, "half_matmul"); m.def("half_matmul_cublas", &half_matmul_cublas, "half_matmul_cublas"); - m.def("rep_penalty", &rep_penalty, "repetition penalty mask"); + m.def("rep_penalty", &rep_penalty, "rep_penalty"); + m.def("apply_rep_penalty", &apply_rep_penalty, "apply_rep_penalty"); } diff --git a/generator.py b/generator.py index beb67820..dc00d00b 100644 --- a/generator.py +++ b/generator.py @@ -293,7 +293,7 @@ def generate_simple(self, prompt, max_new_tokens = 128): # Generate a single token with the current settings, append to sequence - def gen_single_token(self, constraints = None, lora = None): + def gen_single_token(self, constraints = None): self.end_beam_search() @@ -301,12 +301,14 @@ def gen_single_token(self, constraints = None, lora = None): if self.sequence is not None: - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) + logits[:, :, self.tokenizer.bos_token_id] = -10000.0 if constraints is not None: @@ -478,13 +480,14 @@ def beam_search(self): # Initial tokens for initial beams - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - # self.cache.debug() logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + 
self.settings.token_repetition_penalty_decay, + logits) tokens, probs = self.sample(logits, self.settings.temperature, @@ -512,13 +515,14 @@ def beam_search(self): beam.to_sequence() - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - # self.cache.debug() logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) tokens, probs = self.sample(logits, self.settings.temperature, From 9a686ca9d7986268ecfd3f81ff2a7e1f1b1daf94 Mon Sep 17 00:00:00 2001 From: turboderp Date: Fri, 23 Jun 2023 01:16:49 +0200 Subject: [PATCH 24/32] Web UI: forward max_seq_len to client --- webui/session.py | 33 +++++++++++++++++---------------- webui/static/main.js | 10 +++++++--- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/webui/session.py b/webui/session.py index a4a80f1a..5a715113 100644 --- a/webui/session.py +++ b/webui/session.py @@ -282,22 +282,23 @@ def api_populate(self): jnode["author_idx"] = self.participants.index(author) dic = {"sessions": names, - "current_session": name, - "fixed_prompt": self.fixed_prompt.text, - "keep_fixed_prompt": self.keep_fixed_prompt, - "participants": self.participants, - "history": historyjson, - "temperature": generator.settings.temperature, - "top_p": generator.settings.top_p, - "min_p": generator.settings.min_p, - "top_k": generator.settings.top_k, - "typical": generator.settings.typical, - "break_on_newline": self.break_on_newline, - "max_response_tokens": self.max_response_tokens, - "chunk_size": self.chunk_size, - "token_repetition_penalty_max": generator.settings.token_repetition_penalty_max, - "token_repetition_penalty_sustain": generator.settings.token_repetition_penalty_sustain, - "token_repetition_penalty_decay": generator.settings.token_repetition_penalty_decay} + "current_session": name, + "fixed_prompt": self.fixed_prompt.text, + "keep_fixed_prompt": self.keep_fixed_prompt, + "participants": self.participants, + "history": historyjson, + "temperature": generator.settings.temperature, + "top_p": generator.settings.top_p, + "min_p": generator.settings.min_p, + "top_k": generator.settings.top_k, + "typical": generator.settings.typical, + "break_on_newline": self.break_on_newline, + "max_response_tokens": self.max_response_tokens, + "chunk_size": self.chunk_size, + "token_repetition_penalty_max": generator.settings.token_repetition_penalty_max, + "token_repetition_penalty_sustain": generator.settings.token_repetition_penalty_sustain, + "token_repetition_penalty_decay": generator.settings.token_repetition_penalty_decay, + "max_seq_len": model.config.max_seq_len} # Add model info diff --git a/webui/static/main.js b/webui/static/main.js index 9ec07785..09cf1ce1 100644 --- a/webui/static/main.js +++ b/webui/static/main.js @@ -65,13 +65,15 @@ function sendGenSettings() { json.token_repetition_penalty_sustain = getTBNumber("sl_repp_sustain_tb"); json.token_repetition_penalty_decay = getTBNumber("sl_repp_decay_tb"); - console.log(json); + // console.log(json); send("/api/set_gen_settings", json); } -function setSlider(id, value) { +function setSlider(id, value, override_max = null) { let slider = document.getElementById(id); + if (override_max) slider.max = override_max; + let tb = 
document.getElementById(id + "_tb"); let decimals = slider.dataset.decimals; let mult = Math.pow(10, decimals); @@ -256,6 +258,8 @@ function populate() { let tf_model_info = document.getElementById("tf_model_info") tf_model_info.value = data.model_info; + let model_max_seq_lan = data.max_seq_len; + // Fixed prompt let tf_fixed_prompt = document.getElementById("tf_fixed_prompt") @@ -285,7 +289,7 @@ function populate() { // Repetition penalty setSlider("sl_repp_penalty", data.token_repetition_penalty_max); - setSlider("sl_repp_sustain", data.token_repetition_penalty_sustain); + setSlider("sl_repp_sustain", data.token_repetition_penalty_sustain, model_max_seq_lan); setSlider("sl_repp_decay", data.token_repetition_penalty_decay); // Participants From 7f31d5e0c2f6c33008ae6f3362e2c04a6dee6252 Mon Sep 17 00:00:00 2001 From: turboderp Date: Fri, 23 Jun 2023 02:20:20 +0200 Subject: [PATCH 25/32] Limit attention size during generation --- generator.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/generator.py b/generator.py index dc00d00b..8b0dde97 100644 --- a/generator.py +++ b/generator.py @@ -4,6 +4,8 @@ import torch import torch.nn.functional as F +DEFAULT_MAX_CHUNK = 2048 + class ExLlamaGenerator: class Settings: @@ -144,7 +146,7 @@ def disallow_tokens(self, tokens): self.disallowed_tokens = tokens - def gen_begin(self, in_tokens): + def gen_begin(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): self.end_beam_search() @@ -153,7 +155,11 @@ def gen_begin(self, in_tokens): self.cache.current_seq_len = 0 if in_tokens.shape[-1] > 1: - self.model.forward(self.sequence[:, :-1], self.cache, preprocess_only = True, lora = self.lora) + a = 0 + while a < self.sequence.shape[-1] - 1: + b = min(a + max_chunk, self.sequence.shape[-1] - 1) + self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora) + a = b def gen_begin_empty(self): @@ -164,11 +170,11 @@ def gen_begin_empty(self): self.cache.current_seq_len = 0 - def gen_begin_reuse(self, in_tokens): + def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): self.end_beam_search() if self.sequence is None or self.cache.current_seq_len == 0: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return 0 # if in_tokens.shape[-1] < self.sequence.shape[-1]: @@ -179,7 +185,7 @@ def gen_begin_reuse(self, in_tokens): reuse += 1 if reuse < 2: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return 0 # print (f"Reusing cache: {reuse} tokens") @@ -192,10 +198,10 @@ def gen_begin_reuse(self, in_tokens): return reuse - def gen_feed_tokens(self, in_tokens): + def gen_feed_tokens(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): if self.sequence is None: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return self.end_beam_search() @@ -206,7 +212,12 @@ def gen_feed_tokens(self, in_tokens): self.sequence = in_tokens.clone() else: self.sequence = torch.cat((self.sequence, in_tokens), dim = 1) - self.model.forward(self.sequence[:, start:-1], self.cache, preprocess_only = True, lora = self.lora) + + a = start + while a < self.sequence.shape[-1] - 1: + b = min(a + max_chunk, self.sequence.shape[-1] - 1) + self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora) + a = b self.sequence_actual = self.sequence From 248f59fb33535128088c058978bfcd89128565a2 Mon Sep 17 00:00:00 2001 From: turboderp Date: Fri, 23 Jun 2023 20:28:48 +0200 Subject: [PATCH 26/32] Add LoRA option to CLI chatbot example --- 
example_chatbot.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/example_chatbot.py b/example_chatbot.py index a8b7f6a8..f9fd9fc5 100644 --- a/example_chatbot.py +++ b/example_chatbot.py @@ -1,4 +1,5 @@ from model import ExLlama, ExLlamaCache, ExLlamaConfig +from lora import ExLlamaLora from tokenizer import ExLlamaTokenizer from generator import ExLlamaGenerator import argparse @@ -19,6 +20,10 @@ model_init.add_args(parser) +parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") +parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") +parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. to use during benchmark") + parser.add_argument("-p", "--prompt", type = str, help = "Prompt file") parser.add_argument("-un", "--username", type = str, help = "Display name of user", default = "User") parser.add_argument("-bn", "--botname", type = str, help = "Display name of chatbot", default = "Chatbort") @@ -38,6 +43,12 @@ model_init.post_parse(args) model_init.get_model_files(args) +# Paths + +if args.lora_dir is not None: + args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") + args.lora = os.path.join(args.lora_dir, "adapter_model.bin") + # Some feedback print(f" -- Sequence length: {args.length}") @@ -54,11 +65,11 @@ model_init.print_options(args, print_opts) +# Load prompt file + username = args.username bot_name = args.botname -# Load prompt file - if args.prompt is not None: with open(args.prompt, "r") as f: past = f.read() @@ -81,6 +92,21 @@ model_init.print_stats(model) +# Load LoRA + +lora = None +if args.lora: + print(f" -- LoRA config: {args.lora_config}") + print(f" -- Loading LoRA: {args.lora}") + if args.lora_config is None: + print(f" ## Error: please specify lora path to adapter_config.json") + sys.exit() + lora = ExLlamaLora(model, args.lora_config, args.lora) + if lora.bias_ignored: + print(f" !! Warning: LoRA zero bias ignored") + +# Generator + generator = ExLlamaGenerator(model, tokenizer, cache) generator.settings = ExLlamaGenerator.Settings() generator.settings.temperature = args.temperature @@ -93,6 +119,8 @@ generator.settings.beams = args.beams generator.settings.beam_length = args.beam_length +generator.lora = lora + break_on_newline = not args.no_newline # Be nice to Chatbort From f24617bc3f03737a1b15cf6772deb805d5808058 Mon Sep 17 00:00:00 2001 From: Alexander Ljungberg Date: Fri, 23 Jun 2023 21:46:06 +0100 Subject: [PATCH 27/32] Fixed: batching led to faulty results and crashes. Without this fix, the rotary embedding code had a buffer overrun for the rotation coefficient tables which would lead to incorrect results and sometimes `CUDA error: an illegal memory access was encountered`. The reason is that we were viewing the input as `position * number of heads` rows of head dimension elements each. So when we had processed all rows for a position, we "incremented" the position and kept going, without taking the result modulus the actual number of rows in the batch. So in the best case we would apply the wrong rotary embedding for batch 1, pos 0 forward, worst case we'd try to apply an embedding for position 2049, which we never generated. The fix is to stick each batch in its own cuda block and do the right number of rows per batch. 
One could instead have taken the modulus to find the correct positional ordinal (assuming a contiguous x over the batch dimension), but this solution seemed cleaner. Also added a batch generation validator and confirmed we now get reasonable-looking results. Refs #50. --- exllama_ext/cuda_func/q4_attn.cu | 11 ++++--- exllama_ext/cuda_func/q4_attn.cuh | 1 + exllama_ext/cuda_func/rope.cu | 34 +++++++++++--------- exllama_ext/cuda_func/rope.cuh | 1 + exllama_ext/exllama_ext.cpp | 18 +++++++---- model.py | 6 ++-- test_benchmark_inference.py | 53 +++++++++++++++++++++++++++++-- 7 files changed, 92 insertions(+), 32 deletions(-) diff --git a/exllama_ext/cuda_func/q4_attn.cu b/exllama_ext/cuda_func/q4_attn.cu index 54336962..5c6e1be4 100644 --- a/exllama_ext/cuda_func/q4_attn.cu +++ b/exllama_ext/cuda_func/q4_attn.cu @@ -87,6 +87,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, @@ -119,7 +120,7 @@ void q4_attn_cuda ((num_heads + THREADS_Z - 1) / THREADS_Z + BLOCKSIZE_Z - 1) / BLOCKSIZE_Z ); - int _rows = q_len * num_heads; + int _rows_per_batch = q_len * num_heads; CudaBuffers* buffers = get_buffers(device_index); @@ -156,8 +157,8 @@ void q4_attn_cuda // Positional embeddings q, k - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); // Update cache tensors with projected k, v @@ -177,13 +178,13 @@ void q4_attn_cuda // str_1: project q, positions q, sync q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false, str_1); - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len, str_1); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_1); cudaEventRecord(sync_1, str_1); // str_2: project k, positions k, sync q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? true : false, str_2); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len, str_2); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_2); cudaEventRecord(sync_2, str_2); // str_3: project v, wait for str_2, copy (k,v) to cache, sync diff --git a/exllama_ext/cuda_func/q4_attn.cuh b/exllama_ext/cuda_func/q4_attn.cuh index 7a6ae9ae..ce2e29fa 100644 --- a/exllama_ext/cuda_func/q4_attn.cuh +++ b/exllama_ext/cuda_func/q4_attn.cuh @@ -24,6 +24,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, diff --git a/exllama_ext/cuda_func/rope.cu b/exllama_ext/cuda_func/rope.cu index 5179553c..ef64e9da 100644 --- a/exllama_ext/cuda_func/rope.cu +++ b/exllama_ext/cuda_func/rope.cu @@ -23,13 +23,14 @@ __global__ void rope_cuda_kernel half* __restrict__ x, const half* __restrict__ sin, const half* __restrict__ cos, - int rows, + int rows_per_batch, int head_dim, int num_heads, int past_len ) { - MatrixView_half_rw x_(x, rows, head_dim); + // These heights aren't used so it's okay if they're wrong. 
+ MatrixView_half_rw x_(x, rows_per_batch, head_dim); MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim); MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim); @@ -37,7 +38,9 @@ __global__ void rope_cuda_kernel int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2; int row = blockIdx.y * THREADS_Y + threadIdx.y; - if (row >= rows) return; + if (row >= rows_per_batch) return; + int batch_offset = blockIdx.z * rows_per_batch; + int row_offset = batch_offset + row; // Get sin and cos @@ -54,14 +57,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half2 item2_l = x_.item_half2(row, column); - half2 item2_r = x_.item_half2(row, column + half_dim); + half2 item2_l = x_.item_half2(row_offset, column); + half2 item2_r = x_.item_half2(row_offset, column + half_dim); half2 item2_ls = __hmul2(item2_r, sin2_l); half2 item2_rs = __hmul2(item2_l, sin2_r); item2_l = __hfma2(item2_l, cos2_l, item2_ls); item2_r = __hfma2(item2_r, cos2_r, item2_rs); - x_.set_half2(row, column, item2_l); - x_.set_half2(row, column + half_dim, item2_r); + x_.set_half2(row_offset, column, item2_l); + x_.set_half2(row_offset, column + half_dim, item2_r); } else { @@ -73,14 +76,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half item_l = x_.item(row, column); - half item_r = x_.item(row, column + half_dim); + half item_l = x_.item(row_offset, column); + half item_r = x_.item(row_offset, column + half_dim); half item_ls = __hmul(item_r, sin_l); half item_rs = __hmul(item_l, sin_r); item_l = __hfma(item_l, cos_l, item_ls); item_r = __hfma(item_r, cos_r, item_rs); - x_.set(row, column, item_l); - x_.set(row, column + half_dim, item_r); + x_.set(row_offset, column, item_l); + x_.set(row_offset, column + half_dim, item_r); } } @@ -100,7 +103,8 @@ void rope_cuda half* x, const half* sin, const half* cos, - const int rows, + const int bsz, + const int rows_per_batch, const int head_dim, const int num_heads, const int past_len, @@ -112,10 +116,10 @@ void rope_cuda dim3 blocks ( (head_dim + THREADS_X - 1) / THREADS_X / 2 / (tuningParams->rope_no_half2 ? 
1 : 2), - (rows + THREADS_Y - 1) / THREADS_Y, - 1 + (rows_per_batch + THREADS_Y - 1) / THREADS_Y, + int(bsz) ); fp_rope_cuda_kernel kernel = rope_cuda_kernel_pick(tuningParams); - kernel<<>>(x, sin, cos, rows, head_dim, num_heads, past_len); + kernel<<>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len); } diff --git a/exllama_ext/cuda_func/rope.cuh b/exllama_ext/cuda_func/rope.cuh index 64d5b88b..a0ffd33f 100644 --- a/exllama_ext/cuda_func/rope.cuh +++ b/exllama_ext/cuda_func/rope.cuh @@ -13,6 +13,7 @@ void rope_cuda half* x, const half* sin, const half* cos, + const int bsz, const int rows, const int head_dim, const int num_heads, diff --git a/exllama_ext/exllama_ext.cpp b/exllama_ext/exllama_ext.cpp index 3cfc31d4..ec330b09 100644 --- a/exllama_ext/exllama_ext.cpp +++ b/exllama_ext/exllama_ext.cpp @@ -418,12 +418,12 @@ void half_matmul_cublas void q4_attn ( - torch::Tensor x, // shape == (q_len, dim) + torch::Tensor x, // shape == (bsz, q_len, dim) torch::Tensor rms_norm_weight, // shape == (x.shape[1],) == (dim,) float epsilon, - torch::Tensor query_states, // shape == (q_len, dim) - torch::Tensor key_states, // shape == (q_len, dim) - torch::Tensor value_states, // shape == (q_len, dim) + torch::Tensor query_states, // shape == (bsz, q_len, dim) + torch::Tensor key_states, // shape == (bsz, q_len, dim) + torch::Tensor value_states, // shape == (bsz, q_len, dim) uintptr_t q_proj, uintptr_t k_proj, uintptr_t v_proj, @@ -448,7 +448,8 @@ void q4_attn TORCH_CHECK_DTYPE(query_states, kHalf); TORCH_CHECK_DTYPE(key_states, kHalf); - int dim = query_states.size(1); + int bsz = query_states.size(0); + int dim = query_states.size(2); torch::Device device = x.device(); int device_index = device.index(); @@ -477,6 +478,7 @@ void q4_attn reinterpret_cast(v_proj), (half*) sin.data_ptr(), (half*) cos.data_ptr(), + bsz, q_len, dim, head_dim, @@ -648,7 +650,8 @@ void rope_ TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim"); TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim"); - int rows = x.numel() / head_dim; + int bsz = x.size(0); + int rows_per_batch = x.numel() / head_dim / bsz; const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); @@ -658,7 +661,8 @@ void rope_ (half*) x.data_ptr(), (half*) sin.data_ptr(), (half*) cos.data_ptr(), - rows, + bsz, + rows_per_batch, head_dim, num_heads, past_len diff --git a/model.py b/model.py index 7be22cf6..f7396f1c 100644 --- a/model.py +++ b/model.py @@ -310,9 +310,9 @@ def fused(self, hidden_states, cache, buffer, input_layernorm, lora): # Project q, k, v, apply position embeddings to k and v, update cache - query_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - key_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - value_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + query_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + key_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + value_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) cuda_ext.exllama_ext.q4_attn(hidden_states, input_layernorm.weight, diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index c8ad68a4..c00d0c03 100644 --- 
a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -90,7 +90,7 @@ def mem(name, total = False): perplexity.add_args(parser) parser.add_argument("-p", "--perf", action = "store_true", help = "Benchmark speed and VRAM usage") -parser.add_argument("-v", "--validate", action = "store_true", help = "Quick perplexity benchmark just to test if model is working at all, and short text completion") +parser.add_argument("-v", "--validate", action = "count", help = "Run validation check and generate some sample output; specify twice for a more thorough test") parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. to use during benchmark") @@ -242,7 +242,56 @@ def mem(name, total = False): generator = ExLlamaGenerator(model, tokenizer, cache) generator.settings.top_k = 1 generator.lora = lora - text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20) + text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) # text = generator.generate_simple("To be or", max_new_tokens = 20) text = text.replace("\n", "\\n") print(f" ** Generation: {text}") + + if args.validate > 1: + bsz = 8 + gen_len = 20 + + # Test batched generation + saved_logits = [] + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + + del cache + cache = ExLlamaCache(model, batch_size = bsz) + identical_batch_prompt = "When you have eliminated the impossible, whatever remains," + continuations = [ + " must be considered", + " ought to be", + " (and some scholars say this is", + " however improbable, is a banana.", + ] + ids = [] + for i in range(bsz - len(continuations)): + ids.append(tokenizer.encode(identical_batch_prompt)[0]) + for cont in continuations: + ids.append(tokenizer.encode(identical_batch_prompt + cont)[0]) + max_length = max([i.shape[0] for i in ids]) + assert max_length < model.config.max_seq_len, f"Max length {max_length} exceeds model limit {model.config.max_seq_len}" + # Left pad with bos tokens + for i in range(len(ids)): + ids[i] = torch.cat((torch.full((max_length - ids[i].shape[0],), 0), ids[i]), dim = 0) + ids = torch.stack(ids, dim = 0) + + sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") + logits = next_logits(ids, lora) + for i in range(gen_len): + logits = logits[:, -1, :] + id_per_batch = torch.argmax(logits, dim=-1) + assert id_per_batch.shape == (bsz,), f"{id_per_batch.shape} != {(bsz,)}" + next_id_per_batch = id_per_batch.unsqueeze(-1) + sequence = torch.cat((sequence, next_id_per_batch), dim = -1) + logits = next_logits(next_id_per_batch, lora) + + print(f"The first {bsz - len(continuations)} generations should be identical. The remaining generations should be different yet not corrupt.\n\n") + separator = tokenizer.encode("...")[0] + for b in range(len(ids)): + whole = torch.cat((ids[b], separator, sequence[b]), dim = -1) + # unpad + whole = whole[whole != 0] + text = tokenizer.decode(whole) + print(f" {b}. {repr(text)}") \ No newline at end of file From 20feb557fb422e98435d328c4dda5b08743e8057 Mon Sep 17 00:00:00 2001 From: Alexander Ljungberg Date: Fri, 23 Jun 2023 22:28:05 +0100 Subject: [PATCH 28/32] Minor clean-up. 
--- test_benchmark_inference.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index c00d0c03..30b11804 100644 --- a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -248,11 +248,11 @@ def mem(name, total = False): print(f" ** Generation: {text}") if args.validate > 1: + # Test batched generation + bsz = 8 gen_len = 20 - # Test batched generation - saved_logits = [] torch.manual_seed(42) torch.cuda.manual_seed_all(42) @@ -272,7 +272,7 @@ def mem(name, total = False): ids.append(tokenizer.encode(identical_batch_prompt + cont)[0]) max_length = max([i.shape[0] for i in ids]) assert max_length < model.config.max_seq_len, f"Max length {max_length} exceeds model limit {model.config.max_seq_len}" - # Left pad with bos tokens + # Left pad for i in range(len(ids)): ids[i] = torch.cat((torch.full((max_length - ids[i].shape[0],), 0), ids[i]), dim = 0) ids = torch.stack(ids, dim = 0) @@ -287,11 +287,13 @@ def mem(name, total = False): sequence = torch.cat((sequence, next_id_per_batch), dim = -1) logits = next_logits(next_id_per_batch, lora) - print(f"The first {bsz - len(continuations)} generations should be identical. The remaining generations should be different yet not corrupt.\n\n") + print(f"\n ** Batching sanity check: 1-{bsz - len(continuations)} should be identical. All should be reasonable for the model you're using.\n") separator = tokenizer.encode("...")[0] for b in range(len(ids)): whole = torch.cat((ids[b], separator, sequence[b]), dim = -1) # unpad whole = whole[whole != 0] text = tokenizer.decode(whole) - print(f" {b}. {repr(text)}") \ No newline at end of file + print(f" {b + 1}. {repr(text)}") + + # TODO Save the logits and then rerun each prompt with a batch size of 1, same input. The logits should be identical. From 4f1684b977bfd84ba759b799343c00b4041722d6 Mon Sep 17 00:00:00 2001 From: Alexander Ljungberg Date: Sat, 24 Jun 2023 14:59:13 +0100 Subject: [PATCH 29/32] Fixed: batching led to faulty results, crashes and men wielding bananas. (#101) * Fixed: batching led to faulty results and crashes. Without this fix, the rotary embedding code had a buffer overrun for the rotation coefficient tables which would lead to incorrect results and sometimes `CUDA error: an illegal memory access was encountered`. The reason is that we were viewing the input as `position * number of heads` rows of head dimension elements each. So when we had processed all rows for a position, we "incremented" the position and kept going, without taking the result modulus the actual number of rows in the batch. So in the best case we would apply the wrong rotary embedding for batch 1, pos 0 forward, worst case we'd try to apply an embedding for position 2049, which we never generated. The fix is to stick each batch in its own cuda block and do the right number of rows per batch. One could instead have taken the modulus to find the correct positional ordinal (assuming a contiguous x over the batch dimension), but this solution seemed cleaner. Also added a batch generation validator and confirmed we now get reasonable-looking results. Refs #50. * Minor clean-up. * Fixed: batch test padding not masked out, leading to funny generations for certain models. 
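To make the arithmetic concrete, here is a minimal, purely illustrative sketch. It assumes, as the description above implies, that the lookup position is recovered from a flat row index by integer division with the number of heads; the constants are invented so the broken scheme lands on the quoted position 2049, and the actual fix is the per-batch grid dimension and row offset in exllama_ext/cuda_func/rope.cu in the hunks below.

    # Illustration only, not part of the patch: worst-case RoPE table position
    # derived from a flat row index, before and after counting rows per batch.
    bsz, q_len, num_heads, past_len = 2, 1025, 32, 0   # invented example values

    # Old view: every row of the whole batch indexed as one long sequence, so the
    # derived position keeps climbing past q_len for batch entries after the first.
    total_rows = bsz * q_len * num_heads
    worst_pos_old = past_len + (total_rows - 1) // num_heads      # 2049, never generated

    # Fixed view: rows counted per batch (one CUDA block per batch entry), so the
    # position always stays inside [past_len, past_len + q_len).
    rows_per_batch = q_len * num_heads
    worst_pos_new = past_len + (rows_per_batch - 1) // num_heads  # 1024, in range

    print(worst_pos_old, worst_pos_new)   # 2049 1024

With the z grid dimension set to bsz, the kernel addresses each row relative to its own batch entry (row_offset = blockIdx.z * rows_per_batch + row), which is exactly the change made in the rope.cu hunks below.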
--- exllama_ext/cuda_func/q4_attn.cu | 11 ++++--- exllama_ext/cuda_func/q4_attn.cuh | 1 + exllama_ext/cuda_func/rope.cu | 34 ++++++++++--------- exllama_ext/cuda_func/rope.cuh | 1 + exllama_ext/exllama_ext.cpp | 18 ++++++---- model.py | 6 ++-- test_benchmark_inference.py | 55 +++++++++++++++++++++++++++++-- 7 files changed, 94 insertions(+), 32 deletions(-) diff --git a/exllama_ext/cuda_func/q4_attn.cu b/exllama_ext/cuda_func/q4_attn.cu index 54336962..5c6e1be4 100644 --- a/exllama_ext/cuda_func/q4_attn.cu +++ b/exllama_ext/cuda_func/q4_attn.cu @@ -87,6 +87,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, @@ -119,7 +120,7 @@ void q4_attn_cuda ((num_heads + THREADS_Z - 1) / THREADS_Z + BLOCKSIZE_Z - 1) / BLOCKSIZE_Z ); - int _rows = q_len * num_heads; + int _rows_per_batch = q_len * num_heads; CudaBuffers* buffers = get_buffers(device_index); @@ -156,8 +157,8 @@ void q4_attn_cuda // Positional embeddings q, k - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); // Update cache tensors with projected k, v @@ -177,13 +178,13 @@ void q4_attn_cuda // str_1: project q, positions q, sync q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false, str_1); - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len, str_1); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_1); cudaEventRecord(sync_1, str_1); // str_2: project k, positions k, sync q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? true : false, str_2); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len, str_2); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_2); cudaEventRecord(sync_2, str_2); // str_3: project v, wait for str_2, copy (k,v) to cache, sync diff --git a/exllama_ext/cuda_func/q4_attn.cuh b/exllama_ext/cuda_func/q4_attn.cuh index 7a6ae9ae..ce2e29fa 100644 --- a/exllama_ext/cuda_func/q4_attn.cuh +++ b/exllama_ext/cuda_func/q4_attn.cuh @@ -24,6 +24,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, diff --git a/exllama_ext/cuda_func/rope.cu b/exllama_ext/cuda_func/rope.cu index 5179553c..ef64e9da 100644 --- a/exllama_ext/cuda_func/rope.cu +++ b/exllama_ext/cuda_func/rope.cu @@ -23,13 +23,14 @@ __global__ void rope_cuda_kernel half* __restrict__ x, const half* __restrict__ sin, const half* __restrict__ cos, - int rows, + int rows_per_batch, int head_dim, int num_heads, int past_len ) { - MatrixView_half_rw x_(x, rows, head_dim); + // These heights aren't used so it's okay if they're wrong. 
+ MatrixView_half_rw x_(x, rows_per_batch, head_dim); MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim); MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim); @@ -37,7 +38,9 @@ __global__ void rope_cuda_kernel int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2; int row = blockIdx.y * THREADS_Y + threadIdx.y; - if (row >= rows) return; + if (row >= rows_per_batch) return; + int batch_offset = blockIdx.z * rows_per_batch; + int row_offset = batch_offset + row; // Get sin and cos @@ -54,14 +57,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half2 item2_l = x_.item_half2(row, column); - half2 item2_r = x_.item_half2(row, column + half_dim); + half2 item2_l = x_.item_half2(row_offset, column); + half2 item2_r = x_.item_half2(row_offset, column + half_dim); half2 item2_ls = __hmul2(item2_r, sin2_l); half2 item2_rs = __hmul2(item2_l, sin2_r); item2_l = __hfma2(item2_l, cos2_l, item2_ls); item2_r = __hfma2(item2_r, cos2_r, item2_rs); - x_.set_half2(row, column, item2_l); - x_.set_half2(row, column + half_dim, item2_r); + x_.set_half2(row_offset, column, item2_l); + x_.set_half2(row_offset, column + half_dim, item2_r); } else { @@ -73,14 +76,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half item_l = x_.item(row, column); - half item_r = x_.item(row, column + half_dim); + half item_l = x_.item(row_offset, column); + half item_r = x_.item(row_offset, column + half_dim); half item_ls = __hmul(item_r, sin_l); half item_rs = __hmul(item_l, sin_r); item_l = __hfma(item_l, cos_l, item_ls); item_r = __hfma(item_r, cos_r, item_rs); - x_.set(row, column, item_l); - x_.set(row, column + half_dim, item_r); + x_.set(row_offset, column, item_l); + x_.set(row_offset, column + half_dim, item_r); } } @@ -100,7 +103,8 @@ void rope_cuda half* x, const half* sin, const half* cos, - const int rows, + const int bsz, + const int rows_per_batch, const int head_dim, const int num_heads, const int past_len, @@ -112,10 +116,10 @@ void rope_cuda dim3 blocks ( (head_dim + THREADS_X - 1) / THREADS_X / 2 / (tuningParams->rope_no_half2 ? 
1 : 2), - (rows + THREADS_Y - 1) / THREADS_Y, - 1 + (rows_per_batch + THREADS_Y - 1) / THREADS_Y, + int(bsz) ); fp_rope_cuda_kernel kernel = rope_cuda_kernel_pick(tuningParams); - kernel<<>>(x, sin, cos, rows, head_dim, num_heads, past_len); + kernel<<>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len); } diff --git a/exllama_ext/cuda_func/rope.cuh b/exllama_ext/cuda_func/rope.cuh index 64d5b88b..a0ffd33f 100644 --- a/exllama_ext/cuda_func/rope.cuh +++ b/exllama_ext/cuda_func/rope.cuh @@ -13,6 +13,7 @@ void rope_cuda half* x, const half* sin, const half* cos, + const int bsz, const int rows, const int head_dim, const int num_heads, diff --git a/exllama_ext/exllama_ext.cpp b/exllama_ext/exllama_ext.cpp index 3cfc31d4..ec330b09 100644 --- a/exllama_ext/exllama_ext.cpp +++ b/exllama_ext/exllama_ext.cpp @@ -418,12 +418,12 @@ void half_matmul_cublas void q4_attn ( - torch::Tensor x, // shape == (q_len, dim) + torch::Tensor x, // shape == (bsz, q_len, dim) torch::Tensor rms_norm_weight, // shape == (x.shape[1],) == (dim,) float epsilon, - torch::Tensor query_states, // shape == (q_len, dim) - torch::Tensor key_states, // shape == (q_len, dim) - torch::Tensor value_states, // shape == (q_len, dim) + torch::Tensor query_states, // shape == (bsz, q_len, dim) + torch::Tensor key_states, // shape == (bsz, q_len, dim) + torch::Tensor value_states, // shape == (bsz, q_len, dim) uintptr_t q_proj, uintptr_t k_proj, uintptr_t v_proj, @@ -448,7 +448,8 @@ void q4_attn TORCH_CHECK_DTYPE(query_states, kHalf); TORCH_CHECK_DTYPE(key_states, kHalf); - int dim = query_states.size(1); + int bsz = query_states.size(0); + int dim = query_states.size(2); torch::Device device = x.device(); int device_index = device.index(); @@ -477,6 +478,7 @@ void q4_attn reinterpret_cast(v_proj), (half*) sin.data_ptr(), (half*) cos.data_ptr(), + bsz, q_len, dim, head_dim, @@ -648,7 +650,8 @@ void rope_ TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim"); TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim"); - int rows = x.numel() / head_dim; + int bsz = x.size(0); + int rows_per_batch = x.numel() / head_dim / bsz; const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); @@ -658,7 +661,8 @@ void rope_ (half*) x.data_ptr(), (half*) sin.data_ptr(), (half*) cos.data_ptr(), - rows, + bsz, + rows_per_batch, head_dim, num_heads, past_len diff --git a/model.py b/model.py index 7be22cf6..f7396f1c 100644 --- a/model.py +++ b/model.py @@ -310,9 +310,9 @@ def fused(self, hidden_states, cache, buffer, input_layernorm, lora): # Project q, k, v, apply position embeddings to k and v, update cache - query_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - key_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - value_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + query_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + key_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + value_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) cuda_ext.exllama_ext.q4_attn(hidden_states, input_layernorm.weight, diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index c8ad68a4..ec8edd8c 100644 --- 
a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -90,7 +90,7 @@ def mem(name, total = False): perplexity.add_args(parser) parser.add_argument("-p", "--perf", action = "store_true", help = "Benchmark speed and VRAM usage") -parser.add_argument("-v", "--validate", action = "store_true", help = "Quick perplexity benchmark just to test if model is working at all, and short text completion") +parser.add_argument("-v", "--validate", action = "count", help = "Run validation check and generate some sample output; specify twice for a more thorough test") parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. to use during benchmark") @@ -242,7 +242,58 @@ def mem(name, total = False): generator = ExLlamaGenerator(model, tokenizer, cache) generator.settings.top_k = 1 generator.lora = lora - text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20) + text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) # text = generator.generate_simple("To be or", max_new_tokens = 20) text = text.replace("\n", "\\n") print(f" ** Generation: {text}") + + if args.validate > 1: + # Test batched generation + + bsz = 8 + gen_len = 20 + + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + + del cache + cache = ExLlamaCache(model, batch_size = bsz) + identical_batch_prompt = "When you have eliminated the impossible, whatever remains," + continuations = [ + " must be considered", + " ought to be", + " (and some scholars say this is", + " however improbable, is a banana.", + ] + ids = [] + for i in range(bsz - len(continuations)): + ids.append(tokenizer.encode(identical_batch_prompt)[0]) + for cont in continuations: + ids.append(tokenizer.encode(identical_batch_prompt + cont)[0]) + max_length = max([i.shape[0] for i in ids]) + assert max_length < model.config.max_seq_len, f"Max length {max_length} exceeds model limit {model.config.max_seq_len}" + + # Left pad with spaces because we can't pass an attention mask to the model + space_token = tokenizer.encode(" ")[0].item() + for i in range(len(ids)): + ids[i] = torch.cat((torch.full((max_length - ids[i].shape[0],), space_token), ids[i]), dim = 0) + ids = torch.stack(ids, dim = 0) + + sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") + logits = next_logits(ids, lora) + for i in range(gen_len): + logits = logits[:, -1, :] + id_per_batch = torch.argmax(logits, dim=-1) + assert id_per_batch.shape == (bsz,), f"{id_per_batch.shape} != {(bsz,)}" + next_id_per_batch = id_per_batch.unsqueeze(-1) + sequence = torch.cat((sequence, next_id_per_batch), dim = -1) + logits = next_logits(next_id_per_batch, lora) + + print(f"\n ** Batching sanity check: 1-{bsz - len(continuations)} should be identical. All should be reasonable for the model you're using.\n") + separator = tokenizer.encode("...")[0] + for b in range(len(ids)): + whole = torch.cat((ids[b], separator, sequence[b]), dim = -1) + text = tokenizer.decode(whole) + print(f" {b + 1}. {repr(text)}") + + # TODO Save the logits and then rerun each prompt with a batch size of 1, same input. The logits should be identical. 
From 696c2bb737bd01e0cca7bc116a398c0e41794d09 Mon Sep 17 00:00:00 2001 From: turboderp Date: Sat, 24 Jun 2023 17:17:44 +0200 Subject: [PATCH 30/32] Add optional input mask to forward pass, masking for batch test --- model.py | 18 +++++++++++++----- test_benchmark_inference.py | 14 +++++++++----- tokenizer.py | 1 + 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/model.py b/model.py index f7396f1c..c963eb27 100644 --- a/model.py +++ b/model.py @@ -404,8 +404,7 @@ def forward(self, hidden_states, cache, buffer, lora): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) attn_weights /= math.sqrt(self.config.head_dim) - if buffer.attn_mask is not None and buffer.attn_mask.shape[2] > 1: attn_weights = attn_weights + buffer.attn_mask - # attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + if buffer.attn_mask is not None: attn_weights = attn_weights + buffer.attn_mask attn_weights = nn.functional.softmax(attn_weights, dim = -1, dtype = torch.float16).to(query_states.dtype) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2) @@ -418,7 +417,7 @@ def forward(self, hidden_states, cache, buffer, lora): # it can only apply a square attention mask. It saves quite a bit of VRAM but in practice Torch seems to use # the same amount of memory at peak anyway. - if past_len > 0: + if past_len > 0 or (bsz > 1 and buffer.attn_mask is not None): attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = buffer.attn_mask, is_causal = False) else: attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = None, is_causal = True) @@ -801,7 +800,9 @@ def __init__(self, config): torch.cuda.empty_cache() - def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None, output_device = None): + def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None, output_device = None, input_mask = None): + + assert input_mask is None or input_mask.shape == input_ids.shape # if torch.is_grad_enabled(): # raise ValueError("Forward pass called with gradients enabled. 
Back propagation is not supported yet.") @@ -820,9 +821,16 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False if seq_len > 1: attn_mask = torch.zeros(batch_size, 1, seq_len, past_len + seq_len, dtype = torch.float16, device = devs[0]) - attn_mask_triu = torch.triu(torch.full((seq_len - 1, seq_len - 1), torch.finfo(torch.float16).min)) + attn_mask_triu = torch.triu(torch.full((seq_len - 1, seq_len - 1), -65504.)) attn_mask[:, :, : seq_len - 1, past_len + 1: past_len + seq_len] = attn_mask_triu + if input_mask is not None: + + input_mask = _move_tensor(input_mask, devs[0], "input_mask", self.config) + input_mask = torch.where(input_mask, 0, -65504.).half() + input_mask = input_mask.unsqueeze(1).unsqueeze(2) + attn_mask = torch.minimum(attn_mask, input_mask) + else: attn_mask = None diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py index 30b11804..6a60d539 100644 --- a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -31,14 +31,14 @@ def begin(): else: cache.current_seq_len = 0 -def next_logits(input_ids, apply_lora, last_id_only = True): +def next_logits(input_ids, apply_lora, last_id_only = True, input_mask = None): global model, cache n_logits = None a = 0 while a < input_ids.shape[-1]: b = min(input_ids.shape[-1], a + 2048) - n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora) + n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora, input_mask = input_mask) a = b return n_logits @@ -243,7 +243,6 @@ def mem(name, total = False): generator.settings.top_k = 1 generator.lora = lora text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) - # text = generator.generate_simple("To be or", max_new_tokens = 20) text = text.replace("\n", "\\n") print(f" ** Generation: {text}") @@ -271,14 +270,19 @@ def mem(name, total = False): for cont in continuations: ids.append(tokenizer.encode(identical_batch_prompt + cont)[0]) max_length = max([i.shape[0] for i in ids]) + assert max_length < model.config.max_seq_len, f"Max length {max_length} exceeds model limit {model.config.max_seq_len}" + # Left pad for i in range(len(ids)): - ids[i] = torch.cat((torch.full((max_length - ids[i].shape[0],), 0), ids[i]), dim = 0) + ids[i] = torch.cat((torch.full((max_length - ids[i].shape[0],), tokenizer.pad_token_id), ids[i]), dim = 0) + ids = torch.stack(ids, dim = 0) + mask = ids.ne(tokenizer.pad_token_id) sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") - logits = next_logits(ids, lora) + logits = next_logits(ids, lora, input_mask = mask) + for i in range(gen_len): logits = logits[:, -1, :] id_per_batch = torch.argmax(logits, dim=-1) diff --git a/tokenizer.py b/tokenizer.py index 0c5a3196..55696ec7 100644 --- a/tokenizer.py +++ b/tokenizer.py @@ -10,6 +10,7 @@ def __init__(self, tokenizer_model_path): self.tokenizer = SentencePieceProcessor(model_file = self.path) self.eos_token_id = self.tokenizer.eos_id() self.bos_token_id = self.tokenizer.bos_id() + self.pad_token_id = 0 self.newline_token_id = 13 def encode(self, text): From cdb6f54c059b7afb08be5352a97df5d87a6f83f1 Mon Sep 17 00:00:00 2001 From: turboderp Date: Sat, 24 Jun 2023 19:53:49 +0200 Subject: [PATCH 31/32] Add batched encode/decode to tokenizer --- test_benchmark_inference.py | 41 ++++++++++++++++++------------------ tokenizer.py | 42 ++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 26 deletions(-) diff --git a/test_benchmark_inference.py 
b/test_benchmark_inference.py index 6a60d539..0c438631 100644 --- a/test_benchmark_inference.py +++ b/test_benchmark_inference.py @@ -243,20 +243,24 @@ def mem(name, total = False): generator.settings.top_k = 1 generator.lora = lora text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) - text = text.replace("\n", "\\n") - print(f" ** Generation: {text}") + print(f" ** Generation: {repr(text)}") if args.validate > 1: + # Test batched generation bsz = 8 gen_len = 20 - torch.manual_seed(42) torch.cuda.manual_seed_all(42) + # Bigger cache for the batch + del cache cache = ExLlamaCache(model, batch_size = bsz) + + # Create tokenized batch and attention mask + identical_batch_prompt = "When you have eliminated the impossible, whatever remains," continuations = [ " must be considered", @@ -264,22 +268,18 @@ def mem(name, total = False): " (and some scholars say this is", " however improbable, is a banana.", ] - ids = [] - for i in range(bsz - len(continuations)): - ids.append(tokenizer.encode(identical_batch_prompt)[0]) - for cont in continuations: - ids.append(tokenizer.encode(identical_batch_prompt + cont)[0]) - max_length = max([i.shape[0] for i in ids]) - assert max_length < model.config.max_seq_len, f"Max length {max_length} exceeds model limit {model.config.max_seq_len}" + prompts = [identical_batch_prompt] * (bsz - len(continuations)) + for cont in continuations: + prompts.append(identical_batch_prompt + cont) - # Left pad - for i in range(len(ids)): - ids[i] = torch.cat((torch.full((max_length - ids[i].shape[0],), tokenizer.pad_token_id), ids[i]), dim = 0) + ids = tokenizer.encode(prompts) + assert ids.shape[1] < model.config.max_seq_len, f"Max length {ids.shape[1]} exceeds model limit {model.config.max_seq_len}" - ids = torch.stack(ids, dim = 0) mask = ids.ne(tokenizer.pad_token_id) + # Batched generation with greedy sampling + sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") logits = next_logits(ids, lora, input_mask = mask) @@ -291,13 +291,12 @@ def mem(name, total = False): sequence = torch.cat((sequence, next_id_per_batch), dim = -1) logits = next_logits(next_id_per_batch, lora) + # Print output batch + print(f"\n ** Batching sanity check: 1-{bsz - len(continuations)} should be identical. All should be reasonable for the model you're using.\n") - separator = tokenizer.encode("...")[0] - for b in range(len(ids)): - whole = torch.cat((ids[b], separator, sequence[b]), dim = -1) - # unpad - whole = whole[whole != 0] - text = tokenizer.decode(whole) - print(f" {b + 1}. {repr(text)}") + + outputs = tokenizer.decode(sequence) + for b in range(bsz): + print(f"{b + 1} {repr(prompts[b])} -> {repr(outputs[b])}") # TODO Save the logits and then rerun each prompt with a batch size of 1, same input. The logits should be identical. 
diff --git a/tokenizer.py b/tokenizer.py index 55696ec7..2fbc7c47 100644 --- a/tokenizer.py +++ b/tokenizer.py @@ -13,16 +13,48 @@ def __init__(self, tokenizer_model_path): self.pad_token_id = 0 self.newline_token_id = 13 + # Encode string + def encode(self, text): - ids = self.tokenizer.Encode(text) - return torch.tensor(ids).unsqueeze(0) + if isinstance(text, list): + + # text is a list of strings + + list_ids = self.tokenizer.Encode(text) + max_length = max([len(ids) for ids in list_ids]) + + padded_ids = [] + for ids in list_ids: + padding = torch.full((max_length - len(ids),), self.pad_token_id) + sequence = torch.tensor(ids) + padded_ids.append(torch.cat((padding, sequence), dim = 0)) + + return torch.stack(padded_ids, dim = 0) + + else: + + # text is a single string + + ids = self.tokenizer.Encode(text) + return torch.tensor(ids).unsqueeze(0) def decode(self, ids): - ids = ids.tolist() - text = self.tokenizer.Decode(ids) - return text + if ids.dim() > 1: + + texts = [] + for i in range(ids.shape[0]): + seq = ids[i].tolist() + seq = [t for t in seq if t != self.pad_token_id] + texts.append(self.tokenizer.Decode(seq)) + return texts + + else: + + ids = ids.tolist() + text = self.tokenizer.Decode(ids) + return text def num_tokens(self, text): From a01b25c884881871a0f75c96bbc582b6581665cb Mon Sep 17 00:00:00 2001 From: turboderp Date: Sat, 24 Jun 2023 21:32:01 +0200 Subject: [PATCH 32/32] Add batch support to generate_simple(), also example --- example_basic.py | 2 +- example_batch.py | 56 +++++++++++++++++++++++++++++++++++++ exllama_ext/exllama_ext.cpp | 27 +++++++++++------- generator.py | 41 ++++++++++++++++++++------- tokenizer.py | 1 + 5 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 example_batch.py diff --git a/example_basic.py b/example_basic.py index 8db3b424..e95d0adb 100644 --- a/example_basic.py +++ b/example_basic.py @@ -3,7 +3,7 @@ from generator import ExLlamaGenerator import os, glob -# Directory containt model, tokenizer, generator +# Directory containing model, tokenizer, generator model_directory = "/mnt/str/models/llama-13b-4bit-128g/" diff --git a/example_batch.py b/example_batch.py new file mode 100644 index 00000000..179cf2f4 --- /dev/null +++ b/example_batch.py @@ -0,0 +1,56 @@ +from model import ExLlama, ExLlamaCache, ExLlamaConfig +from tokenizer import ExLlamaTokenizer +from generator import ExLlamaGenerator +import os, glob + +# Directory containing model, tokenizer, generator + +model_directory = "/mnt/str/models/llama-13b-4bit-128g/" + +# Locate files we need within that directory + +tokenizer_path = os.path.join(model_directory, "tokenizer.model") +model_config_path = os.path.join(model_directory, "config.json") +st_pattern = os.path.join(model_directory, "*.safetensors") +model_path = glob.glob(st_pattern)[0] + +# Batched prompts + +prompts = [ + "Once upon a time,", + "I don't like to", + "A turbo encabulator is a", + "In the words of Mark Twain," +] + +# Create config, model, tokenizer and generator + +config = ExLlamaConfig(model_config_path) # create config from config.json +config.model_path = model_path # supply path to model weights file + +model = ExLlama(config) # create ExLlama instance and load the weights +tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file + +cache = ExLlamaCache(model, batch_size = len(prompts)) # create cache for inference +generator = ExLlamaGenerator(model, tokenizer, cache) # create generator + +# Configure generator + 
diff --git a/example_batch.py b/example_batch.py
new file mode 100644
index 00000000..179cf2f4
--- /dev/null
+++ b/example_batch.py
@@ -0,0 +1,56 @@
+from model import ExLlama, ExLlamaCache, ExLlamaConfig
+from tokenizer import ExLlamaTokenizer
+from generator import ExLlamaGenerator
+import os, glob
+
+# Directory containing model, tokenizer, generator
+
+model_directory = "/mnt/str/models/llama-13b-4bit-128g/"
+
+# Locate files we need within that directory
+
+tokenizer_path = os.path.join(model_directory, "tokenizer.model")
+model_config_path = os.path.join(model_directory, "config.json")
+st_pattern = os.path.join(model_directory, "*.safetensors")
+model_path = glob.glob(st_pattern)[0]
+
+# Batched prompts
+
+prompts = [
+    "Once upon a time,",
+    "I don't like to",
+    "A turbo encabulator is a",
+    "In the words of Mark Twain,"
+]
+
+# Create config, model, tokenizer and generator
+
+config = ExLlamaConfig(model_config_path)               # create config from config.json
+config.model_path = model_path                          # supply path to model weights file
+
+model = ExLlama(config)                                 # create ExLlama instance and load the weights
+tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
+
+cache = ExLlamaCache(model, batch_size = len(prompts))  # create cache for inference
+generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator
+
+# Configure generator
+
+generator.disallow_tokens([tokenizer.eos_token_id])
+
+generator.settings.token_repetition_penalty_max = 1.2
+generator.settings.temperature = 0.95
+generator.settings.top_p = 0.65
+generator.settings.top_k = 100
+generator.settings.typical = 0.5
+
+# Generate, batched
+
+for line in prompts:
+    print(line)
+
+output = generator.generate_simple(prompts, max_new_tokens = 200)
+
+for line in output:
+    print("---")
+    print(line)

diff --git a/exllama_ext/exllama_ext.cpp b/exllama_ext/exllama_ext.cpp
index ec330b09..615f7f4f 100644
--- a/exllama_ext/exllama_ext.cpp
+++ b/exllama_ext/exllama_ext.cpp
@@ -686,6 +686,8 @@ void rep_penalty
     int vocab_size = rep_mask.size(0);
     int seq_len = sequence.size(-1);

+    // TODO: Support batch size
+
     rep_penalty_cpu
     (
         vocab_size,
@@ -709,20 +711,25 @@ void apply_rep_penalty
 {
     TORCH_CHECK_DTYPE(sequence, kLong);
     TORCH_CHECK_DTYPE(logits, kFloat);
+    TORCH_CHECK_SHAPES(sequence, 0, logits, 0, 1);

     int vocab_size = logits.size(-1);
+    int bsz = sequence.size(0);
     int seq_len = sequence.size(-1);

-    apply_rep_penalty_cpu
-    (
-        vocab_size,
-        (uint64_t*) sequence.data_ptr(),
-        penalty_max,
-        sustain,
-        decay,
-        seq_len,
-        (float*) logits.data_ptr()
-    );
+    for (int i = 0; i < bsz; i++)
+    {
+        apply_rep_penalty_cpu
+        (
+            vocab_size,
+            ((uint64_t*) sequence.data_ptr()) + i * seq_len,
+            penalty_max,
+            sustain,
+            decay,
+            seq_len,
+            ((float*) logits.data_ptr()) + i * vocab_size
+        );
+    }
 }

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
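The apply_rep_penalty change above walks the batch one row at a time, offsetting into the flat sequence and logits buffers by i * seq_len and i * vocab_size respectively, so it assumes contiguous row-major tensors with one penalized logit row per batch entry. A rough Python rendering of the same indexing, purely to illustrate that layout assumption:

    # sequence: LongTensor (bsz, seq_len), logits: FloatTensor (bsz, vocab_size), both contiguous
    for i in range(bsz):
        seq_row = sequence.view(-1)[i * seq_len : (i + 1) * seq_len]
        logits_row = logits.view(-1)[i * vocab_size : (i + 1) * vocab_size]
        # ...apply the penalty to logits_row based on the tokens in seq_row,
        # as apply_rep_penalty_cpu does per row in the C++ code above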
diff --git a/generator.py b/generator.py
index 8b0dde97..638cba63 100644
--- a/generator.py
+++ b/generator.py
@@ -61,11 +61,27 @@ def make_rep_mask(self, penalty_max, sustain, decay):
         return cuda_ext.ext_rep_penalty_mask_cpu(self.model.config.vocab_size, self.sequence, penalty_max, sustain, decay)

+    def batched_sample(self, logits, temperature, top_k, top_p, min_p, typical, num = 1):
+
+        if logits.shape[0] == 1: return self.sample(logits, temperature, top_k, top_p, min_p, typical, num)
+
+        samples = []
+        scores = []
+        for i in range(logits.shape[0]):
+            t, s = self.sample(logits[i, :, :], temperature, top_k, top_p, min_p, typical)
+            samples.append(t)
+            scores.append(s)
+
+        return torch.cat(samples, dim = 0), torch.cat(scores, dim = 0)
+
+
     def sample(self, logits, temperature, top_k, top_p, min_p, typical, num = 1):

         # torch.manual_seed(42)

-        logits = logits[0, -1, :]
+        if logits.dim() == 3: logits = logits[0, -1, :]
+        elif logits.dim() == 2: logits = logits[-1, :]
+        else: raise ValueError("Bad logits dimension")

         # Disallow tokens

@@ -285,7 +301,7 @@ def gen_num_tokens(self):
         return self.sequence_actual.shape[-1]

-    # Generate some number of tokens and append to
+    # Simple generator function

     def generate_simple(self, prompt, max_new_tokens = 128):

@@ -294,11 +310,16 @@ def generate_simple(self, prompt, max_new_tokens = 128):
         ids = self.tokenizer.encode(prompt)
         self.gen_begin(ids)

+        max_new_tokens = min(max_new_tokens, self.model.config.max_seq_len - ids.shape[1])
+
+        eos = torch.zeros((ids.shape[0],), dtype = torch.bool)

         for i in range(max_new_tokens):
             token = self.gen_single_token()
-            if token.item() == self.tokenizer.eos_token_id: break
+            for j in range(token.shape[0]):
+                if token[j, 0].item() == self.tokenizer.eos_token_id: eos[j] = True
+            if eos.all(): break

-        text = self.tokenizer.decode(self.sequence[0])
+        text = self.tokenizer.decode(self.sequence[0] if self.sequence.shape[0] == 1 else self.sequence)
         return text

@@ -327,12 +348,12 @@ def gen_single_token(self, constraints = None):
             for c in constraints: logits[:, :, c] += 10000.0
             logits[:, :, :] -= 10000.0

-            token, _ = self.sample(logits,
-                                   self.settings.temperature,
-                                   self.settings.top_k,
-                                   self.settings.top_p,
-                                   self.settings.min_p + 0.01 if constraints is not None else 0.0,
-                                   self.settings.typical)
+            token, _ = self.batched_sample(logits,
+                                           self.settings.temperature,
+                                           self.settings.top_k,
+                                           self.settings.top_p,
+                                           self.settings.min_p + 0.01 if constraints is not None else 0.0,
+                                           self.settings.typical)

         else:

diff --git a/tokenizer.py b/tokenizer.py
index 2fbc7c47..eeb4fb5f 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -47,6 +47,7 @@ def decode(self, ids):
             for i in range(ids.shape[0]):
                 seq = ids[i].tolist()
                 seq = [t for t in seq if t != self.pad_token_id]
+                if self.eos_token_id in seq: seq = seq[:seq.index(self.eos_token_id)]
                 texts.append(self.tokenizer.Decode(seq))
             return texts
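One side effect of the generate_simple() change above is that its return type now follows the input: a single string prompt yields a string, while a list of prompts yields a list (mirroring tokenizer.decode()). A tiny illustrative wrapper, not part of the patch, for callers that always want a list:

    def generate_simple_list(generator, prompts, max_new_tokens = 128):
        # Normalize generate_simple()'s return type to a list of strings
        out = generator.generate_simple(prompts, max_new_tokens = max_new_tokens)
        return out if isinstance(out, list) else [out]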