diff --git a/.env b/.env index 0b6f65f3..3ab8b5a3 100644 --- a/.env +++ b/.env @@ -1,4 +1,5 @@ PORT=5000 +RUN_UID=1000 # set to 0 to run the service as root inside the container +APPLICATION_STATE_PATH=/data # path to the directory holding application state inside the container MODEL_PATH=models/LLaMA-7B-4bit-128g # replace with the actual model path on the host -CONTAINER_MODEL_PATH=/app/model -SESSIONS_PATH=./exllama_sessions +SESSIONS_PATH=~/exllama_sessions # replace with the actual directory on the host where chat sessions should be stored diff --git a/Dockerfile b/Dockerfile index b7dd15ce..829e9dad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,31 @@ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as build - -ENV RUN_UID=1000 +ARG RUN_UID="1000" \ + APPLICATION_STATE_PATH="/data" +ENV RUN_UID=$RUN_UID \ + APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH \ + CONTAINER_MODEL_PATH=$APPLICATION_STATE_PATH/model \ + CONTAINER_SESSIONS_PATH=$APPLICATION_STATE_PATH/exllama_sessions RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y ninja-build python3 python3-pip && \ rm -rf /var/lib/apt/lists/* -# Setup user which will run the service -RUN useradd -m -u $RUN_UID user -USER user +# Setup user which will run the service and create application state directory +RUN if [ ${RUN_UID} -ne 0 ] ; then useradd -m -u $RUN_UID user ; fi \ + && mkdir -p $APPLICATION_STATE_PATH \ + && mkdir -p $CONTAINER_MODEL_PATH \ + && mkdir -p $CONTAINER_SESSIONS_PATH \ + && chown -R $RUN_UID $APPLICATION_STATE_PATH +USER $RUN_UID -COPY --chown=user . /app +COPY --chown=$RUN_UID . /app WORKDIR /app +# Create application state directory and install python packages RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt \ - && pip install flask==2.3.2 + && pip install -r requirements-web.txt USER root diff --git a/README.md b/README.md index d73f5b65..e5b0074d 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,16 @@ incompatibilities with older cards. ## Dependencies +* Python 3.9 or newer * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118 * `safetensors` 0.3.1 * `sentencepiece` * `ninja` -* `flask` (only for the web UI) + +Additionally, only for the web UI: + +* `flask` +* `waitress` ## Linux/WSL prerequisites @@ -30,7 +35,7 @@ To run on Windows (without WSL): Studio 2022` IDE, or alternatively just the `Build Tools for Visual Studio 2022` package (make sure `Desktop development with C++` is ticked in the installer), it doesn't really matter which. 2. Install the appropriate version of [PyTorch](https://pytorch.org/get-started/locally/), choosing one of the CUDA -versions. I am developing on the nightly build, but the stable version should also work. +versions. I am developing on the nightly build, but the stable version (2.0.1) should also work. 3. Install CUDA Toolkit, ([11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and [11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) both seem to work, just make sure to match PyTorch's Compute Platform version). @@ -40,7 +45,7 @@ Compute Platform version). Install dependencies, clone repo and run benchmark: - pip install safetensors sentencepiece ninja + pip install -r requirements.txt git clone https://github.com/turboderp/exllama cd exllama @@ -65,11 +70,12 @@ multibot mode: To run it: - pip install flask + pip install -r requirements-web.txt python webui/app.py -d -Note that sessions are stored in `~/exllama_sessions/`. +Note that sessions are stored in `~/exllama_sessions/`. 
You can change the location of the sessions storage with `-sd` +if you want. ## Docker For security benefits and easier deployment, it is also possible to run the web UI in an isolated docker container. Note: the docker image currently only supports NVIDIA GPUs. @@ -91,7 +97,13 @@ docker compose build It is also possible to manually build the image: ``` -docker build -t exllama-web +docker build -t exllama-web . +``` + +NOTE: by default, the service inside the docker container is run by a non-root user. Hence, the ownership of bind-mounted directories (`/data/model` and `/data/exllama_sessions` in the default `docker-compose.yml` file) is changed to this non-root user in the container entrypoint (`entrypoint.sh`). To disable this, set `RUN_UID=0` in the `.env` file if using `docker compose`, or the following command if you manually build the image: + +``` +docker build -t exllama-web --build-arg RUN_UID=0 . ``` ### Run @@ -109,7 +121,7 @@ The configuration can be viewed in `docker-compose.yml` and changed by creating Run manually: ``` -docker run --gpus all -p 5000:5000 -v :/app/model/ --rm -it exllama-web --host 0.0.0.0:5000 +docker run --gpus all -p 5000:5000 -v :/data/model/ -v :/data/exllama_sessions --rm -it exllama-web --host 0.0.0.0:5000 ``` @@ -179,20 +191,6 @@ confirmed to be working right now. ## Recent updates -**2023-05-24**: Added fused rotary embeddings and some minor optimizations. 13% faster on 7B, 9% on 13B. Small -improvement on larger models. Added best-case scores to benchmark results and some clarification. For easier -comparisons to other implementations, or whatever. - -**2023-05-27**: Better memory management in CUDA. Introduced auto switch between Torch's SDP backend and regular -matmul attention with some tweaks. Finished CUDA MLP. All in all about 10% faster with these updates. - -**2023-05-29**: Web UI is _almost_ up and running. Having to learn JavaScript, and it turns out I hate JavaScript. But -ChatGPT is an incredible resource for learning new languages, I gotta say, so it's not as painful as it could have -been. Anyway, in the process of working with the UI I discovered I've been measuring prompt speed incorrectly. Either -Torch or CUDA or the GPU driver does some sort of caching or self-calibration or lazy initialization during the first -pass through the model, so subsequent passes are actually _way_ faster than what I've been recording. Doesn't do much -for individual tokens, but benchmarks updated anyway. Closing in on 10k tokens/second for 7B. (!) - **2023-06-02**: Web UI is now in a fairly working state. Expect it to be a little scuffed in places. There will be a rewrite at some point to make the client-side code less seizure-inducing. It has multibot mode, chat rewind and editing features, sessions, and more. I'm going to build it out with support for instruct prompting and such, in time. @@ -216,5 +214,5 @@ disabled by default. YMMV. Use `-cs` to try it out. **2023-06-17**: Fixed a nasty bug in the fused attention that was causing slightly incorrect cache states on 13B and 33B models. You definitely want to update. -**2023-06-18**: LoRA support now. Still needs a lot of testing and som optimization, and currently you can't stack -multiple LoRAs during the same inference. There's also no support in the web UI yet. \ No newline at end of file +**2023-06-18**: LoRA support now. Still needs a lot of testing and some optimization, and currently you can't stack +multiple LoRAs during the same inference. There's also no support in the web UI yet. 
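For reference, the Docker workflow described in the README changes above can be tied together with a single `.env` file. The sketch below is illustrative only: the host paths are placeholders, the other values simply restate the defaults introduced in the `.env` diff, and `docker compose up` is assumed to be the usual way to start the service described under the Run section.

```
# .env — example configuration for docker compose (host paths are placeholders)
PORT=5000
RUN_UID=1000                             # set to 0 to run the service as root inside the container
APPLICATION_STATE_PATH=/data             # application state directory inside the container
MODEL_PATH=/path/to/LLaMA-7B-4bit-128g   # model directory on the host
SESSIONS_PATH=~/exllama_sessions         # chat session storage on the host

# Build and start the web UI
docker compose build
docker compose up
```

With `RUN_UID=0`, `entrypoint.sh` skips the `chown` of the bind-mounted state directory, matching the README note about running the service as root.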
diff --git a/cuda_ext.py b/cuda_ext.py index 91b4397b..5efd3d4a 100644 --- a/cuda_ext.py +++ b/cuda_ext.py @@ -30,14 +30,15 @@ def find_msvc(): import subprocess try: - subprocess.check_output(["where", "cl"]) + subprocess.check_output(["where", "/Q", "cl"]) except subprocess.CalledProcessError as e: cl_path = find_msvc() if cl_path: - print("Injected compiler path:", cl_path) + if verbose: + print("Injected compiler path:", cl_path) os.environ["path"] += ";" + cl_path else: - print("Unable to find cl.exe; compilation will probably fail.") + print("Unable to find cl.exe; compilation will probably fail.", file=sys.stderr) exllama_ext = load( name = extension_name, @@ -56,7 +57,7 @@ def find_msvc(): ], extra_include_paths = [os.path.join(library_dir, "exllama_ext")], verbose = verbose, - extra_ldflags = ["cublas.lib"] if windows else [], + extra_ldflags = (["cublas.lib"] + ([f"/LIBPATH:{os.path.join(sys.base_prefix, 'libs')}"] if sys.base_prefix != sys.prefix else [])) if windows else [], extra_cuda_cflags = ["-lineinfo"] + (["-U__HIP_NO_HALF_CONVERSIONS__", "-O3"] if torch.version.hip else []), extra_cflags = ["-O3"] # extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"] @@ -73,6 +74,7 @@ def find_msvc(): from exllama_ext import rms_norm from exllama_ext import rope_ from exllama_ext import rep_penalty +from exllama_ext import apply_rep_penalty # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension @@ -157,3 +159,9 @@ def ext_rep_penalty_mask_cpu(vocab_size, sequence, penalty_max, sustain, decay): rep_mask = torch.empty(vocab_size, dtype = torch.float32) rep_penalty(sequence, rep_mask, penalty_max, sustain, decay) return rep_mask + + +def ext_apply_rep_penalty_mask_cpu(sequence, penalty_max, sustain, decay, logits): + + apply_rep_penalty(sequence, penalty_max, sustain, decay, logits) + diff --git a/cuda_test/compile.sh b/cuda_test/compile.sh deleted file mode 100755 index 3f04d83b..00000000 --- a/cuda_test/compile.sh +++ /dev/null @@ -1,7 +0,0 @@ -/opt/cuda/bin/nvcc -isystem /opt/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 --compiler-options '-fPIC' -std=c++17 \ -test.cu \ -../exllama_ext/q4v2_mlp.cu \ -../exllama_ext/rms_norm.cu \ -../exllama_ext/q4v2_matmul.cu \ -../exllama_ext/column_remap.cu \ --o ./test diff --git a/cuda_test/test.cu b/cuda_test/test.cu deleted file mode 100644 index 47a2e69b..00000000 --- a/cuda_test/test.cu +++ /dev/null @@ -1,339 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../exllama_ext/util.h" -#include "../exllama_ext/matrix.h" -#include "../exllama_ext/q4v2_mlp.h" - -using namespace std; - -template -class Tensor -{ -public: - T* data_cuda; - T* data_cpu; - uint32_t height; - uint32_t width; - - // From file - - Tensor(const char* filename) - { - FILE* file = fopen(filename, "rb"); - if (!file) - { - cout << "File not found: " << filename << "\n"; - return; - } - - fseek(file, 0, SEEK_END); - long size = ftell(file); - fseek(file, 0, SEEK_SET); - size_t num_elements = size / sizeof(T); - - if (num_elements == 0) - { - data_cuda = NULL; - data_cpu = NULL; - height = 0; - width = 0; - - cout << " ** " << filename << " (None)\n"; - return; - } - - data_cpu = new T[num_elements]; - 
fread(data_cpu, sizeof(T), num_elements, file); - fclose(file); - - char filenameshape[1024]; - strcpy(filenameshape, filename); - strcat(filenameshape, ".shape"); - - file = fopen(filenameshape, "rb"); - if (!file) - { - cout << "File not found: " << filenameshape << "\n"; - return; - } - - fread(&height, 1, sizeof(uint32_t), file); - fread(&width, 1, sizeof(uint32_t), file); - fclose(file); - - if (width * height != num_elements) - { - cout << "Incorrect shape: " << filenameshape << "\n"; - return; - } - - cudaMalloc(&data_cuda, size); - dataToCUDA(); - - cout << " ** " << filename << " (" << height << ", " << width << ")\n"; - } - - // Empty tensor - - Tensor(int _height, int _width) - { - height = _height; - width = _width; - - size_t size = (height * width) * sizeof(T); - cudaMalloc(&data_cuda, size); - - data_cpu = new T[height * width]; - } - - // Zero tensor - - Tensor(int _height, int _width, T zero_value) - { - height = _height; - width = _width; - - size_t size = (height * width) * sizeof(T); - cudaMalloc(&data_cuda, size); - - data_cpu = new T[height * width]; - - for (int i = 0; i < _width * _height; i++) data_cpu[i] = zero_value; - dataToCUDA(); - } - - // Fill - - void fill(T value) - { - for (int i = 0; i < width * height; i++) data_cpu[i] = value; - dataToCUDA(); - } - - // Copy data - - void dataToCUDA() - { - size_t size = (height * width) * sizeof(T); - cudaMemcpy(data_cuda, data_cpu, size, cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); - } - - void dataToCPU() - { - size_t size = (height * width) * sizeof(T); - cudaMemcpy(data_cpu, data_cuda, size, cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - } - -}; - -__global__ void dummyKernel() -{ - // Dummy kernel -} - -void warmUpCUDA() -{ - // Create a CUDA context - cudaFree(0); - - // Launch a dummy kernel - dummyKernel<<<1, 1>>>(); - cudaDeviceSynchronize(); -} - - -template -float compareTensors(Tensor& a, Tensor& b, int height = 0, int width = 0) -{ - if (height == 0 && (a.width != b.width || a.height != b.height)) - { - cout << "Incompatible sizes.\n"; - return std::numeric_limits::infinity(); - } - - if (height == 0) height = a.height; - if (width == 0) width = a.width; - - a.dataToCPU(); - b.dataToCPU(); - - float m = 0.0f; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - float a_f = __half2float(a.data_cpu[r * a.width + c]); - float b_f = __half2float(b.data_cpu[r * b.width + c]); - m = fmax(m, fabs(a_f - b_f)); - } - } - - return m; -} - -void printTensor(Tensor& a) -{ - int width = 8; if (width > a.width) width = a.width; - int height = 8; if (height > a.height) height = a.height; - - a.dataToCPU(); - - for (int c = 0; c < width; c++) cout << "---------"; - cout << "\n"; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - float a_f = __half2float(a.data_cpu[r * a.width + c]); - cout << setfill(' ') << setprecision(5) << setw(9) << a_f << dec; - } - cout << "\n"; - } -} - -void printTensor(Tensor& a) -{ - int width = 8; if (width > a.width) width = a.width; - int height = 8; if (height > a.height) height = a.height; - - a.dataToCPU(); - - for (int c = 0; c < width; c++) cout << "---------"; - cout << "\n"; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - uint32_t a_i = a.data_cpu[r * a.width + c]; - cout << " " << setfill('0') << setw(8) << hex << a_i << dec << setfill(' '); - } - cout << "\n"; - } -} - - -int main() -{ - warmUpCUDA(); - int iters; - - cout << fixed << setprecision(6); - cout << "Loading 
tensors...\n"; - - // Test MLP - - Tensor x ("mlp/test_mlp_x"); - Tensor x_gated ("mlp/test_mlp_x_gated"); - Tensor x_done ("mlp/test_mlp_x_done"); - Tensor x_prenorm ("mlp/test_mlp_x_prenorm"); - Tensor x_postresidual ("mlp/test_mlp_x_postresidual"); - - Tensor rms_norm_weight ("mlp/test_mlp_norm_weight"); - - Tensor up_proj_bias ("mlp/up_proj.bias"); - Tensor up_proj_qweight ("mlp/up_proj.qweight"); - Tensor up_proj_qzeros ("mlp/up_proj.qzeros"); - Tensor up_proj_scales ("mlp/up_proj.scales"); - Tensor up_proj_seq_g_idx ("mlp/up_proj.seq_g_idx"); - Tensor up_proj_x_map ("mlp/up_proj.x_map"); - - Tensor gate_proj_bias ("mlp/gate_proj.bias"); - Tensor gate_proj_qweight ("mlp/gate_proj.qweight"); - Tensor gate_proj_qzeros ("mlp/gate_proj.qzeros"); - Tensor gate_proj_scales ("mlp/gate_proj.scales"); - Tensor gate_proj_seq_g_idx ("mlp/gate_proj.seq_g_idx"); - Tensor gate_proj_x_map ("mlp/gate_proj.x_map"); - - Tensor down_proj_bias ("mlp/down_proj.bias"); - Tensor down_proj_qweight ("mlp/down_proj.qweight"); - Tensor down_proj_qzeros ("mlp/down_proj.qzeros"); - Tensor down_proj_scales ("mlp/down_proj.scales"); - Tensor down_proj_seq_g_idx ("mlp/down_proj.seq_g_idx"); - Tensor down_proj_x_map ("mlp/down_proj.x_map"); - - Tensor x_temp(x.height, x.width); - Tensor x_col_temp(1, x.height); - Tensor x_act_temp(x.height, gate_proj_qweight.width); - - Tensor out(x_gated.height, x_gated.width); - - int groupsize = gate_proj_qweight.height * 8 / gate_proj_qzeros.height; - - iters = 1; - auto start_time = chrono::high_resolution_clock::now(); - - cout << "--------\n"; - - cout << "Fused MLP (" << iters << " iterations)... "; - - for (int i = 0; i < iters; i++) - { - q4v2_mlp_cuda - ( - x_prenorm.data_cuda, // input - - x_temp.data_cuda, // input, normalized (empty) - x_col_temp.data_cuda, // temp for norm (empty) - x_act_temp.data_cuda, // temp for act(x @ gate) * x @ up (empty) - - rms_norm_weight.data_cuda, - (1e-06), - - gate_proj_qweight.data_cuda, - gate_proj_scales.data_cuda, - gate_proj_qzeros.data_cuda, - gate_proj_seq_g_idx.data_cuda, - gate_proj_x_map.data_cuda, - - up_proj_qweight.data_cuda, - up_proj_scales.data_cuda, - up_proj_qzeros.data_cuda, - up_proj_seq_g_idx.data_cuda, - up_proj_x_map.data_cuda, - - down_proj_qweight.data_cuda, - down_proj_scales.data_cuda, - down_proj_qzeros.data_cuda, - down_proj_seq_g_idx.data_cuda, - down_proj_x_map.data_cuda, - - x.height, - x.width, - gate_proj_qweight.width, - groupsize - ); - } - - cudaDeviceSynchronize(); - - auto end_time = chrono::high_resolution_clock::now(); - auto duration = chrono::duration_cast(end_time - start_time).count(); - duration /= iters; - cout << duration << " us / iteration\n"; - - cout << "Validating fused MLP... 
"; - - float diff = compareTensors(x_prenorm, x_postresidual); - - cout << "max diff.: " << diff <<"\n"; - - printTensor(x_prenorm); - printTensor(x_postresidual); - - printf("Done\n"); - return 0; -} diff --git a/datasets/download_datasets.py b/datasets/download_datasets.py index 872435cd..804ff7f6 100644 --- a/datasets/download_datasets.py +++ b/datasets/download_datasets.py @@ -11,7 +11,7 @@ def download_hf(filename, dataset, subset, split, key, div): hf_dataset = load_dataset(dataset, subset, split = split) data = div.join(hf_dataset[key]) - with open(filename, "w") as f: + with open(filename, "w", encoding="utf-8") as f: f.write(data) download_hf("wikitext2.txt", "wikitext", "wikitext-2-raw-v1", "test", "text", "\n\n") diff --git a/docker-compose.yml b/docker-compose.yml index 5e47c1c2..369b6864 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,15 +4,16 @@ services: web: build: context: . + args: + - RUN_UID=$RUN_UID + - APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH command: | --host 0.0.0.0:$PORT env_file: - .env - environment: - - CONTAINER_MODEL_PATH=$CONTAINER_MODEL_PATH volumes: - - $MODEL_PATH:$CONTAINER_MODEL_PATH - - $SESSIONS_PATH:/home/user/exllama_sessions + - $MODEL_PATH:$APPLICATION_STATE_PATH/model + - $SESSIONS_PATH:$APPLICATION_STATE_PATH/exllama_sessions ports: - "$PORT:$PORT" tmpfs: diff --git a/entrypoint.sh b/entrypoint.sh index 79864aa8..74d077f7 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,15 +1,19 @@ #!/usr/bin/env bash set -Eeuo pipefail -# Ensure that the model path is set -if [ -z $CONTAINER_MODEL_PATH ]; then - echo "Must specify model path" +# Ensure that the application state path is set +if [ -z $APPLICATION_STATE_PATH ]; then + echo "Must specify application state path" exit 1 fi -# Ensure that bind-mounted directories are owned by the user that runs the service -chown -R $RUN_UID:$RUN_UID $CONTAINER_MODEL_PATH -chown -R $RUN_UID:$RUN_UID /home/user/exllama_sessions +# Ensure that bind-mounted directories are owned by the user that runs the service if the user is not root +if [ $RUN_UID -ne 0 ]; then + chown -R $RUN_UID:$RUN_UID $APPLICATION_STATE_PATH +fi # Run service as specified (non-root) user -exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py -d $CONTAINER_MODEL_PATH $@ +exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py \ + -d $CONTAINER_MODEL_PATH \ + --sessions_dir $CONTAINER_SESSIONS_PATH \ + $@ diff --git a/example_basic.py b/example_basic.py deleted file mode 100644 index 8db3b424..00000000 --- a/example_basic.py +++ /dev/null @@ -1,45 +0,0 @@ -from model import ExLlama, ExLlamaCache, ExLlamaConfig -from tokenizer import ExLlamaTokenizer -from generator import ExLlamaGenerator -import os, glob - -# Directory containt model, tokenizer, generator - -model_directory = "/mnt/str/models/llama-13b-4bit-128g/" - -# Locate files we need within that directory - -tokenizer_path = os.path.join(model_directory, "tokenizer.model") -model_config_path = os.path.join(model_directory, "config.json") -st_pattern = os.path.join(model_directory, "*.safetensors") -model_path = glob.glob(st_pattern)[0] - -# Create config, model, tokenizer and generator - -config = ExLlamaConfig(model_config_path) # create config from config.json -config.model_path = model_path # supply path to model weights file - -model = ExLlama(config) # create ExLlama instance and load the weights -tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - -cache = ExLlamaCache(model) # create cache for inference 
-generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - -# Configure generator - -generator.disallow_tokens([tokenizer.eos_token_id]) - -generator.settings.token_repetition_penalty_max = 1.2 -generator.settings.temperature = 0.95 -generator.settings.top_p = 0.65 -generator.settings.top_k = 100 -generator.settings.typical = 0.5 - -# Produce a simple generation - -prompt = "Once upon a time," -print (prompt, end = "") - -output = generator.generate_simple(prompt, max_new_tokens = 200) - -print(output[len(prompt):]) diff --git a/example_chatbot.py b/example_chatbot.py deleted file mode 100644 index 6e450d22..00000000 --- a/example_chatbot.py +++ /dev/null @@ -1,209 +0,0 @@ -import argparse -import torch -import sys -import os -import glob -import model_init - -from .model import ExLlama, ExLlamaCache, ExLlamaConfig -from .tokenizer import ExLlamaTokenizer -from .generator import ExLlamaGenerator - -# Simple interactive chatbot script - -torch.set_grad_enabled(False) -torch.cuda._lazy_init() - -# Parse arguments - -parser = argparse.ArgumentParser(description = "Simple chatbot example for ExLlama") - -model_init.add_args(parser) - -parser.add_argument("-p", "--prompt", type = str, help = "Prompt file") -parser.add_argument("-un", "--username", type = str, help = "Display name of user", default = "User") -parser.add_argument("-bn", "--botname", type = str, help = "Display name of chatbot", default = "Chatbort") -parser.add_argument("-bf", "--botfirst", action = "store_true", help = "Start chat on bot's turn") - -parser.add_argument("-nnl", "--no_newline", action = "store_true", help = "Do not break bot's response on newline (allow multi-paragraph responses)") -parser.add_argument("-temp", "--temperature", type = float, help = "Temperature", default = 0.95) -parser.add_argument("-topk", "--top_k", type = int, help = "Top-K", default = 20) -parser.add_argument("-topp", "--top_p", type = float, help = "Top-P", default = 0.65) -parser.add_argument("-minp", "--min_p", type = float, help = "Min-P", default = 0.00) -parser.add_argument("-repp", "--repetition_penalty", type = float, help = "Repetition penalty", default = 1.15) -parser.add_argument("-repps", "--repetition_penalty_sustain", type = int, help = "Past length for repetition penalty", default = 256) -parser.add_argument("-beams", "--beams", type = int, help = "Number of beams for beam search", default = 1) -parser.add_argument("-beamlen", "--beam_length", type = int, help = "Number of future tokens to consider", default = 1) - -args = parser.parse_args() -model_init.post_parse(args) -model_init.get_model_files(args) - -# Some feedback - -print(f" -- Sequence length: {args.length}") -print(f" -- Temperature: {args.temperature:.2f}") -print(f" -- Top-K: {args.top_k}") -print(f" -- Top-P: {args.top_p:.2f}") -print(f" -- Min-P: {args.min_p:.2f}") -print(f" -- Repetition penalty: {args.repetition_penalty:.2f}") -print(f" -- Beams: {args.beams} x {args.beam_length}") - -print_opts = [] -if args.no_newline: print_opts.append("no_newline") -if args.botfirst: print_opts.append("botfirst") - -model_init.print_options(args, print_opts) - -username = args.username -bot_name = args.botname - -# Load prompt file - -if args.prompt is not None: - with open(args.prompt, "r") as f: - past = f.read() - past = past.replace("{username}", username) - past = past.replace("{bot_name}", bot_name) - past = past.strip() + "\n" -else: - past = f"{bot_name}: Hello, {username}\n" - -# past += "User: Hi. 
Please say \"Shhhhhh\"?\n" -# args.botfirst = True - -# Instantiate model and generator - -config = model_init.make_config(args) - -model = ExLlama(config) -cache = ExLlamaCache(model) -tokenizer = ExLlamaTokenizer(args.tokenizer) - -model_init.print_stats(model) - -generator = ExLlamaGenerator(model, tokenizer, cache) -generator.settings = ExLlamaGenerator.Settings() -generator.settings.temperature = args.temperature -generator.settings.top_k = args.top_k -generator.settings.top_p = args.top_p -generator.settings.min_p = args.min_p -generator.settings.token_repetition_penalty_max = args.repetition_penalty -generator.settings.token_repetition_penalty_sustain = args.repetition_penalty_sustain -generator.settings.token_repetition_penalty_decay = generator.settings.token_repetition_penalty_sustain // 2 -generator.settings.beams = args.beams -generator.settings.beam_length = args.beam_length - -break_on_newline = not args.no_newline - -# Be nice to Chatbort - -min_response_tokens = 4 -max_response_tokens = 256 -extra_prune = 256 - -print(past, end = "") -ids = tokenizer.encode(past) -generator.gen_begin(ids) - -next_userprompt = username + ": " - -first_round = True - -while True: - - res_line = bot_name + ":" - res_tokens = tokenizer.encode(res_line) - num_res_tokens = res_tokens.shape[-1] # Decode from here - - if first_round and args.botfirst: in_tokens = res_tokens - - else: - - # Read and format input - - in_line = input(next_userprompt) - in_line = username + ": " + in_line.strip() + "\n" - - next_userprompt = username + ": " - - # No need for this, really, unless we were logging the chat. The actual history we work on is kept in the - # tokenized sequence in the generator and the state in the cache. - - past += in_line - - # SentencePiece doesn't tokenize spaces separately so we can't know from individual tokens if they start a new word - # or not. Instead, repeatedly decode the generated response as it's being built, starting from the last newline, - # and print out the differences between consecutive decodings to stream out the response. - - in_tokens = tokenizer.encode(in_line) - in_tokens = torch.cat((in_tokens, res_tokens), dim = 1) - - # If we're approaching the context limit, prune some whole lines from the start of the context. Also prune a - # little extra so we don't end up rebuilding the cache on every line when up against the limit. - - expect_tokens = in_tokens.shape[-1] + max_response_tokens - max_tokens = config.max_seq_len - expect_tokens - if generator.gen_num_tokens() >= max_tokens: - generator.gen_prune_to(config.max_seq_len - expect_tokens - extra_prune, tokenizer.newline_token_id) - - # Feed in the user input and "{bot_name}:", tokenized - - generator.gen_feed_tokens(in_tokens) - - # Generate with streaming - - print(res_line, end = "") - sys.stdout.flush() - - generator.begin_beam_search() - - for i in range(max_response_tokens): - - # Disallowing the end condition tokens seems like a clean way to force longer replies. 
- - if i < min_response_tokens: - generator.disallow_tokens([tokenizer.newline_token_id, tokenizer.eos_token_id]) - else: - generator.disallow_tokens(None) - - # Get a token - - gen_token = generator.beam_search() - - # If token is EOS, replace it with newline before continuing - - if gen_token.item() == tokenizer.eos_token_id: - generator.replace_last_token(tokenizer.newline_token_id) - - # Decode the current line and print any characters added - - num_res_tokens += 1 - text = tokenizer.decode(generator.sequence_actual[:, -num_res_tokens:][0]) - new_text = text[len(res_line):] - - skip_space = res_line.endswith("\n") and new_text.startswith(" ") # Bit prettier console output - res_line += new_text - if skip_space: new_text = new_text[1:] - - print(new_text, end="") # (character streaming output is here) - sys.stdout.flush() - - # End conditions - - if break_on_newline and gen_token.item() == tokenizer.newline_token_id: break - if gen_token.item() == tokenizer.eos_token_id: break - - # Some models will not (or will inconsistently) emit EOS tokens but in a chat sequence will often begin - # generating for the user instead. Try to catch this and roll back a few tokens to begin the user round. - - if res_line.endswith(f"{username}:"): - plen = tokenizer.encode(f"{username}:").shape[-1] - generator.gen_rewind(plen) - next_userprompt = " " - break - - generator.end_beam_search() - - past += res_line - first_round = False diff --git a/example_flask.py b/example_flask.py deleted file mode 100644 index 58aca5fe..00000000 --- a/example_flask.py +++ /dev/null @@ -1,92 +0,0 @@ -from flask import Flask, request -from model import ExLlama, ExLlamaCache, ExLlamaConfig -from tokenizer import ExLlamaTokenizer -from generator import ExLlamaGenerator -import os, glob - -# Directory containing config.json, tokenizer.model and safetensors file for the model -model_directory = "/mnt/str/models/llama-7b-4bit/" - -tokenizer_path = os.path.join(model_directory, "tokenizer.model") -model_config_path = os.path.join(model_directory, "config.json") -st_pattern = os.path.join(model_directory, "*.safetensors") -model_path = glob.glob(st_pattern)[0] - -config = ExLlamaConfig(model_config_path) # create config from config.json -config.model_path = model_path # supply path to model weights file - -model = ExLlama(config) # create ExLlama instance and load the weights -print(f"Model loaded: {model_path}") - -tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file -cache = ExLlamaCache(model) # create cache for inference -generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - -# Flask app - -app = Flask(__name__) - - -# Inference with settings equivalent to the "precise" preset from the /r/LocalLLaMA wiki - -@app.route('/infer_precise', methods=['POST']) -def inferContextP(): - print(request.form) - prompt = request.form.get('prompt') - - generator.settings.token_repetition_penalty_max = 1.176 - generator.settings.token_repetition_penalty_sustain = config.max_seq_len - generator.settings.temperature = 0.7 - generator.settings.top_p = 0.1 - generator.settings.top_k = 40 - generator.settings.typical = 0.0 # Disabled - - outputs = generator.generate_simple(prompt, max_new_tokens = 200) - return outputs - - -# Inference with settings equivalent to the "creative" preset from the /r/LocalLLaMA wiki - -@app.route('/infer_creative', methods=['POST']) -def inferContextC(): - print(request.form) - prompt = request.form.get('prompt') - - 
generator.settings.token_repetition_penalty_max = 1.1 - generator.settings.token_repetition_penalty_sustain = config.max_seq_len - generator.settings.temperature = 0.72 - generator.settings.top_p = 0.73 - generator.settings.top_k = 0 # Disabled - generator.settings.typical = 0.0 # Disabled - - outputs = generator.generate_simple(prompt, max_new_tokens = 200) - return outputs - - -# Inference with settings equivalent to the "sphinx" preset from the /r/LocalLLaMA wiki - -@app.route('/infer_sphinx', methods=['POST']) -def inferContextS(): - print(request.form) - prompt = request.form.get('prompt') - - generator.settings.token_repetition_penalty_max = 1.15 - generator.settings.token_repetition_penalty_sustain = config.max_seq_len - generator.settings.temperature = 1.99 - generator.settings.top_p = 0.18 - generator.settings.top_k = 30 - generator.settings.typical = 0.0 # Disabled - - outputs = generator.generate_simple(prompt, max_new_tokens = 200) - return outputs - - -# Start Flask app - -host = "0.0.0.0" -port = 8004 -print(f"Starting server on address {host}:{port}") - -if __name__ == '__main__': - from waitress import serve - serve(app, host = host, port = port) diff --git a/example_lora.py b/example_lora.py deleted file mode 100644 index e47c34f5..00000000 --- a/example_lora.py +++ /dev/null @@ -1,79 +0,0 @@ -from model import ExLlama, ExLlamaCache, ExLlamaConfig -from tokenizer import ExLlamaTokenizer -from generator import ExLlamaGenerator -from lora import ExLlamaLora -import os, glob -import torch - -# Directory containt model, tokenizer, generator - -model_directory = "/mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-7B-4bit-128g/" - -# Directory containing LoRA config and weights - -lora_directory = "/mnt/str/models/_test_loras/tloen_alpaca-lora-7b/" - -# Locate files we need within those directories - -tokenizer_path = os.path.join(model_directory, "tokenizer.model") -model_config_path = os.path.join(model_directory, "config.json") -st_pattern = os.path.join(model_directory, "*.safetensors") -model_path = glob.glob(st_pattern)[0] - -lora_config_path = os.path.join(lora_directory, "adapter_config.json") -lora_path = os.path.join(lora_directory, "adapter_model.bin") - -# Create config, model, tokenizer and generator - -config = ExLlamaConfig(model_config_path) # create config from config.json -config.model_path = model_path # supply path to model weights file - -model = ExLlama(config) # create ExLlama instance and load the weights -tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - -cache = ExLlamaCache(model) # create cache for inference -generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - -# Load LoRA - -lora = ExLlamaLora(model, lora_config_path, lora_path) - -# Configure generator - -generator.settings.token_repetition_penalty_max = 1.2 -generator.settings.temperature = 0.65 -generator.settings.top_p = 0.4 -generator.settings.top_k = 0 -generator.settings.typical = 0.0 - -# Alpaca prompt - -prompt = \ - "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n" \ - "\n" \ - "### Instruction:\n" \ - "List five colors in alphabetical order.\n" \ - "\n" \ - "### Response:" - -# Generate with LoRA - -print(" --- LoRA ----------------- ") -print("") - -generator.lora = lora -torch.manual_seed(1337) -output = generator.generate_simple(prompt, max_new_tokens = 200) -print(output) - -# Generate without LoRA - -print("") -print(" --- No LoRA -------------- ") -print("") - -generator.lora = None -torch.manual_seed(1337) -output = generator.generate_simple(prompt, max_new_tokens = 200) -print(output) - diff --git a/exllama/generator.py b/exllama/generator.py index 2d02a6da..4ba2f0a7 100644 --- a/exllama/generator.py +++ b/exllama/generator.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F +DEFAULT_MAX_CHUNK = 2048 class ExLlamaGenerator: @@ -62,11 +63,27 @@ def make_rep_mask(self, penalty_max, sustain, decay): return cuda_ext.ext_rep_penalty_mask_cpu(self.model.config.vocab_size, self.sequence, penalty_max, sustain, decay) + def batched_sample(self, logits, temperature, top_k, top_p, min_p, typical, num = 1): + + if logits.shape[0] == 1: return self.sample(logits, temperature, top_k, top_p, min_p, typical, num) + + samples = [] + scores = [] + for i in range(logits.shape[0]): + t, s = self.sample(logits[i, :, :], temperature, top_k, top_p, min_p, typical) + samples.append(t) + scores.append(s) + + return torch.cat(samples, dim = 0), torch.cat(scores, dim = 0) + + def sample(self, logits, temperature, top_k, top_p, min_p, typical, num = 1): # torch.manual_seed(42) - logits = logits[0, -1, :] + if logits.dim() == 3: logits = logits[0, -1, :] + elif logits.dim() == 2: logits = logits[-1, :] + else: raise ValueError("Bad logits dimension") # Disallow tokens @@ -147,7 +164,7 @@ def disallow_tokens(self, tokens): self.disallowed_tokens = tokens - def gen_begin(self, in_tokens): + def gen_begin(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): self.end_beam_search() @@ -156,7 +173,11 @@ def gen_begin(self, in_tokens): self.cache.current_seq_len = 0 if in_tokens.shape[-1] > 1: - self.model.forward(self.sequence[:, :-1], self.cache, preprocess_only = True, lora = self.lora) + a = 0 + while a < self.sequence.shape[-1] - 1: + b = min(a + max_chunk, self.sequence.shape[-1] - 1) + self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora) + a = b def gen_begin_empty(self): @@ -167,11 +188,11 @@ def gen_begin_empty(self): self.cache.current_seq_len = 0 - def gen_begin_reuse(self, in_tokens): + def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): self.end_beam_search() if self.sequence is None or self.cache.current_seq_len == 0: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return 0 # if in_tokens.shape[-1] < self.sequence.shape[-1]: @@ -182,7 +203,7 @@ def gen_begin_reuse(self, in_tokens): reuse += 1 if reuse < 2: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return 0 # print (f"Reusing cache: {reuse} tokens") @@ -195,10 +216,10 @@ def gen_begin_reuse(self, in_tokens): return reuse - def gen_feed_tokens(self, in_tokens): + def gen_feed_tokens(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): if self.sequence is None: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return self.end_beam_search() @@ -209,7 +230,12 @@ def gen_feed_tokens(self, in_tokens): self.sequence = in_tokens.clone() else: self.sequence = torch.cat((self.sequence, in_tokens), dim = 1) - 
self.model.forward(self.sequence[:, start:-1], self.cache, preprocess_only = True, lora = self.lora) + + a = start + while a < self.sequence.shape[-1] - 1: + b = min(a + max_chunk, self.sequence.shape[-1] - 1) + self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora) + a = b self.sequence_actual = self.sequence @@ -277,7 +303,7 @@ def gen_num_tokens(self): return self.sequence_actual.shape[-1] - # Generate some number of tokens and append to + # Simple generator function def generate_simple(self, prompt, max_new_tokens = 128): @@ -286,17 +312,22 @@ def generate_simple(self, prompt, max_new_tokens = 128): ids = self.tokenizer.encode(prompt) self.gen_begin(ids) + max_new_tokens = min(max_new_tokens, self.model.config.max_seq_len - ids.shape[1]) + + eos = torch.zeros((ids.shape[0],), dtype = torch.bool) for i in range(max_new_tokens): token = self.gen_single_token() - if token.item() == self.tokenizer.eos_token_id: break + for j in range(token.shape[0]): + if token[j, 0].item() == self.tokenizer.eos_token_id: eos[j] = True + if eos.all(): break - text = self.tokenizer.decode(self.sequence[0]) + text = self.tokenizer.decode(self.sequence[0] if self.sequence.shape[0] == 1 else self.sequence) return text # Generate a single token with the current settings, append to sequence - def gen_single_token(self, constraints = None, lora = None): + def gen_single_token(self, constraints = None): self.end_beam_search() @@ -304,12 +335,14 @@ def gen_single_token(self, constraints = None, lora = None): if self.sequence is not None: - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) + logits[:, :, self.tokenizer.bos_token_id] = -10000.0 if constraints is not None: @@ -317,12 +350,12 @@ def gen_single_token(self, constraints = None, lora = None): for c in constraints: logits[:, :, c] += 10000.0 logits[:, :, :] -= 10000.0 - token, _ = self.sample(logits, - self.settings.temperature, - self.settings.top_k, - self.settings.top_p, - self.settings.min_p + 0.01 if constraints is not None else 0.0, - self.settings.typical) + token, _ = self.batched_sample(logits, + self.settings.temperature, + self.settings.top_k, + self.settings.top_p, + self.settings.min_p + 0.01 if constraints is not None else 0.0, + self.settings.typical) else: @@ -481,13 +514,14 @@ def beam_search(self): # Initial tokens for initial beams - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - # self.cache.debug() logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) tokens, probs = self.sample(logits, self.settings.temperature, @@ -515,13 +549,14 @@ def beam_search(self): beam.to_sequence() - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - 
self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - # self.cache.debug() logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) tokens, probs = self.sample(logits, self.settings.temperature, diff --git a/exllama/lora.py b/exllama/lora.py index f279af3b..9dd1dd34 100644 --- a/exllama/lora.py +++ b/exllama/lora.py @@ -15,6 +15,7 @@ class ExLlamaLora: lora_scaling: float config: ExLlamaConfig tensors: dict[torch.tensor] + bias_ignored: bool def __init__(self, model, lora_config_path, lora_path): @@ -23,6 +24,7 @@ def __init__(self, model, lora_config_path, lora_path): self.model = model self.config = model.config self.tensors = {} + self.bias_ignored = False # Grab relevant items from LoRA config @@ -58,6 +60,13 @@ def __init__(self, model, lora_config_path, lora_path): decoder_layer = ks[4] lora_half = ks[5] + if lora_half == "bias": + epsilon = 1e-6 + if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon: + raise ValueError(f" ## Error: unsupported bias target {self.lora_path}: {key}") + self.bias_ignored = True + continue + target_module = self.model.layers[decoder_idx] if decoder_part == "self_attn": target_module = target_module.self_attn elif decoder_part == "mlp": target_module = target_module.mlp diff --git a/exllama/model.py b/exllama/model.py index efac3f8f..03a177bf 100644 --- a/exllama/model.py +++ b/exllama/model.py @@ -1,3 +1,10 @@ +import sys +min_version = (3, 9) +if sys.version_info < min_version: + print("") + print(f" ## Warning: this project requires Python {min_version[0]}.{min_version[1]} or higher.") + print("") + import torch from torch import nn import torch.nn.functional as F @@ -64,6 +71,7 @@ def __init__(self, model_config_path): # Optional settings self.max_seq_len = 2048 # Reduce to save memory. Can also be increased, but the pretrained models produce degenerate output after 2048 tokens in any case. Should be possible to finetune for longer sequence lengths. + self.compress_pos_emb = 1.0 # Increase to compress positional embeddings applied to sequence self.gpu_peer_fix = False # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. 
Enable this to move tensors via system RAM instead, where needed self.auto_map = None # List of floats with memory allocation in GB, per CUDA device, overrides device_map @@ -304,9 +312,9 @@ def fused(self, hidden_states, cache, buffer, input_layernorm, lora): # Project q, k, v, apply position embeddings to k and v, update cache - query_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - key_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - value_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + query_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + key_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + value_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) cuda_ext.exllama_ext.q4_attn(hidden_states, input_layernorm.weight, @@ -398,8 +406,7 @@ def forward(self, hidden_states, cache, buffer, lora): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) attn_weights /= math.sqrt(self.config.head_dim) - if buffer.attn_mask is not None and buffer.attn_mask.shape[2] > 1: attn_weights = attn_weights + buffer.attn_mask - # attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + if buffer.attn_mask is not None: attn_weights = attn_weights + buffer.attn_mask attn_weights = nn.functional.softmax(attn_weights, dim = -1, dtype = torch.float16).to(query_states.dtype) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2) @@ -412,7 +419,7 @@ def forward(self, hidden_states, cache, buffer, lora): # it can only apply a square attention mask. It saves quite a bit of VRAM but in practice Torch seems to use # the same amount of memory at peak anyway. 
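+        # Torch's scaled_dot_product_attention cannot combine is_causal with an explicit attn_mask, so the
+        # masked path below is used whenever there is cached past context (past_len > 0) or a batched input
+        # that carries an attention mask (bsz > 1 and attn_mask is not None); a fresh single-sequence pass
+        # relies on the built-in causal masking instead.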
- if past_len > 0: + if past_len > 0 or (bsz > 1 and buffer.attn_mask is not None): attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = buffer.attn_mask, is_causal = False) else: attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = None, is_causal = True) @@ -616,7 +623,7 @@ def _move_tensor(tensor, new_device, name, config): device = str(tensor.device) if device == new_device: return tensor if config.gpu_peer_fix: - if device.startswith("cuda:") and new_device.startswith("cuda:"): + if str(device).startswith("cuda:") and str(new_device).startswith("cuda:"): tensor = tensor.to("cpu") return tensor.to(new_device) @@ -740,6 +747,8 @@ def __init__(self, config): inv_freq = 1.0 / (self.config.rotary_embedding_base ** (torch.arange(0, self.config.head_dim, 2, device = device).float() / self.config.head_dim)) t = torch.arange(self.config.max_seq_len, device = device, dtype = torch.float32) + if self.config.compress_pos_emb != 1.0: t /= self.config.compress_pos_emb + freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim = -1) @@ -793,7 +802,9 @@ def __init__(self, config): torch.cuda.empty_cache() - def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None): + def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None, output_device = None, input_mask = None): + + assert input_mask is None or input_mask.shape == input_ids.shape # if torch.is_grad_enabled(): # raise ValueError("Forward pass called with gradients enabled. Back propagation is not supported yet.") @@ -801,6 +812,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False batch_size, seq_len = input_ids.shape past_len = cache.current_seq_len + if output_device is None: output_device = input_ids.device buffer = ExLlamaBuffer(self.config) @@ -811,9 +823,16 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False if seq_len > 1: attn_mask = torch.zeros(batch_size, 1, seq_len, past_len + seq_len, dtype = torch.float16, device = devs[0]) - attn_mask_triu = torch.triu(torch.full((seq_len - 1, seq_len - 1), torch.finfo(torch.float16).min)) + attn_mask_triu = torch.triu(torch.full((seq_len - 1, seq_len - 1), -65504.)) attn_mask[:, :, : seq_len - 1, past_len + 1: past_len + seq_len] = attn_mask_triu + if input_mask is not None: + + input_mask = _move_tensor(input_mask, devs[0], "input_mask", self.config) + input_mask = torch.where(input_mask, 0, -65504.).half() + input_mask = input_mask.unsqueeze(1).unsqueeze(2) + attn_mask = torch.minimum(attn_mask, input_mask) + else: attn_mask = None @@ -824,7 +843,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False # Embeddings # TODO: Allow passing input embeddings instead of IDs - input_ids = _move_tensor(input_ids, "cpu", "input_ids", self.config) + input_ids = _move_tensor(input_ids, self.config.device_map.embed_tokens, "input_ids", self.config) hidden_states = self.embed_tokens(input_ids) # Split buffers to devices @@ -863,5 +882,15 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False # logits = cuda_ext.matmul_half(hidden_states, self.lm_head_data, cublas = False) logits = logits.float() - logits = _move_tensor(logits, self.config.device_map.embed_tokens, "logits", self.config) + logits = _move_tensor(logits, output_device, "logits", self.config) return logits + + + # Free unmanaged resources 
allocated by the C++ extension. Call this before dereferencing the ExLlama object, + # e.g. if you intend to create a new instance to load another model, but don't call it in a destructor that wraps + # the object, since it relies on CUDA function calls and the CUDA context is one of the first things to go when + # a PyTorch application terminates, before other managed objects are destroyed. + + def free_unmanaged(self): + + cuda_ext.exllama_ext.cleanup() diff --git a/exllama/tokenizer.py b/exllama/tokenizer.py index b17e9f2c..c6964a3e 100644 --- a/exllama/tokenizer.py +++ b/exllama/tokenizer.py @@ -10,19 +10,52 @@ def __init__(self, tokenizer_model_path): self.tokenizer = SentencePieceProcessor(model_file = self.path) self.eos_token_id = self.tokenizer.eos_id() self.bos_token_id = self.tokenizer.bos_id() + self.pad_token_id = 0 self.newline_token_id = 13 + # Encode string + def encode(self, text): - ids = self.tokenizer.Encode(text) - return torch.tensor(ids).unsqueeze(0) + if isinstance(text, list): + + # text is a list of strings + + list_ids = self.tokenizer.Encode(text) + max_length = max([len(ids) for ids in list_ids]) + + padded_ids = [] + for ids in list_ids: + padding = torch.full((max_length - len(ids),), self.pad_token_id) + sequence = torch.tensor(ids) + padded_ids.append(torch.cat((padding, sequence), dim = 0)) + + return torch.stack(padded_ids, dim = 0) + + else: + + # text is a single string + + ids = self.tokenizer.Encode(text) + return torch.tensor(ids).unsqueeze(0) def decode(self, ids): - if not isinstance(ids, list): + if ids.dim() > 1: + + texts = [] + for i in range(ids.shape[0]): + seq = ids[i].tolist() + seq = [t for t in seq if t != self.pad_token_id] + if self.eos_token_id in seq: seq = seq[:seq.index(self.eos_token_id)] + texts.append(self.tokenizer.Decode(seq)) + return texts + + else: + ids = ids.tolist() - text = self.tokenizer.Decode(ids) - return text + text = self.tokenizer.Decode(ids) + return text def num_tokens(self, text): diff --git a/exllama_ext/cpu_func/rep_penalty.cpp b/exllama_ext/cpu_func/rep_penalty.cpp index c9ffe0b7..26c9dc16 100644 --- a/exllama_ext/cpu_func/rep_penalty.cpp +++ b/exllama_ext/cpu_func/rep_penalty.cpp @@ -1,4 +1,6 @@ #include "rep_penalty.h" +#include +#include void rep_penalty_cpu ( @@ -26,4 +28,47 @@ void rep_penalty_cpu if (v > rep_mask[t]) rep_mask[t] = v; if (--s < 0) v += dv; } +} + +bool* g_rep_mask = NULL; +int g_vocab_size = 0; + +void apply_rep_penalty_cpu +( + const int vocab_size, + const uint64_t* sequence, + const float penalty_max, + const int sustain, + const int decay, + const int seq_len, + float* logits +) +{ + if (vocab_size != g_vocab_size) + { + if (g_rep_mask) free(g_rep_mask); + g_vocab_size = vocab_size; + g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool)); + } + + memset(g_rep_mask, 0, g_vocab_size * sizeof(bool)); + + float v = penalty_max; + float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; + + int s = sustain == -1 ? 
seq_len : sustain; + int beg = seq_len - sustain - decay; + if (beg < 0) beg = 0; + + for (int i = seq_len; i > beg;) + { + uint64_t t = sequence[--i]; + if (!g_rep_mask[t]) + { + if (logits[t] > 0.0) logits[t] /= v; + else logits[t] *= v; + g_rep_mask[t] = true; + } + if (--s < 0) v += dv; + } } \ No newline at end of file diff --git a/exllama_ext/cpu_func/rep_penalty.h b/exllama_ext/cpu_func/rep_penalty.h index 895ddd93..4f63b484 100644 --- a/exllama_ext/cpu_func/rep_penalty.h +++ b/exllama_ext/cpu_func/rep_penalty.h @@ -15,4 +15,16 @@ void rep_penalty_cpu const int seq_len ); +void apply_rep_penalty_cpu +( + const int vocab_size, + const uint64_t* sequence, + const float penalty_max, + const int sustain, + const int decay, + const int seq_len, + float* logits +); + + #endif diff --git a/exllama_ext/cuda_buffers.cu b/exllama_ext/cuda_buffers.cu index 7e7bd75d..73146958 100644 --- a/exllama_ext/cuda_buffers.cu +++ b/exllama_ext/cuda_buffers.cu @@ -35,6 +35,12 @@ CudaBuffers::CudaBuffers CudaBuffers::~CudaBuffers() { + cudaStreamDestroy(alt_stream_1); + cudaStreamDestroy(alt_stream_2); + cudaStreamDestroy(alt_stream_3); + cudaEventDestroy(alt_stream_1_done); + cudaEventDestroy(alt_stream_2_done); + cudaEventDestroy(alt_stream_3_done); } float* CudaBuffers::get_zeros_float(const int num_zeros) @@ -76,23 +82,14 @@ void prepare_buffers_cuda ); g_buffers[_device] = buffers; +} -// if (!q4_table_init) -// { -// for (uint v_zero = 0; v_zero < 16; v_zero++) -// { -// for (uint v_read = 0; v_read < 256; v_read++) -// { -// half v_0 = __float2half((float)((int)((v_read ) & 0x0f) - v_zero - 1)); -// half v_1 = __float2half((float)((int)((v_read >> 4) & 0x0f) - v_zero - 1)); -// half2 v_01 = {v_0, v_1}; -// q4_table_host[v_zero][v_read] = v_01; -// } -// } -// q4_table_init = true; -// } -// -// cudaSetDevice(_device); -// cudaMemcpyToSymbol(q4_table, q4_table_host, 16 * 256 * sizeof(half2)); -// cudaDeviceSynchronize(); +void cleanup_buffers_cuda() +{ + for (int i = 0; i < CUDA_MAX_DEVICES; i++) + { + if (!g_buffers[i]) continue; + delete g_buffers[i]; + g_buffers[i] = NULL; + } } diff --git a/exllama_ext/cuda_buffers.cuh b/exllama_ext/cuda_buffers.cuh index 86e8af82..8b3d451d 100644 --- a/exllama_ext/cuda_buffers.cuh +++ b/exllama_ext/cuda_buffers.cuh @@ -58,4 +58,6 @@ void prepare_buffers_cuda int _max_zeros_float ); +void cleanup_buffers_cuda(); + #endif \ No newline at end of file diff --git a/exllama_ext/cuda_func/half_matmul.cu b/exllama_ext/cuda_func/half_matmul.cu index 6f62d9a4..76ee1e43 100644 --- a/exllama_ext/cuda_func/half_matmul.cu +++ b/exllama_ext/cuda_func/half_matmul.cu @@ -130,7 +130,7 @@ const int S_THREADS_X = 8; // width const int S_THREADS_Z = 1; // height const int S_BLOCKSIZE = MAX_DIM_SMALL / 1024 * S_THREADS_X; // dim -template +template __global__ void half_matmul_small_kernel ( const half* __restrict__ x, @@ -163,7 +163,7 @@ __global__ void half_matmul_small_kernel const half* w_ptr = w_.item_ptr(k, column); half* out_ptr = out_.item_ptr(row, column); - if constexpr (use_half2) + if constexpr (use_half2 && !odd_rank) { half2* x_ptr2 = (half2*) x_ptr; half2* x_ptr2_end = (half2*) x_ptr_end; @@ -204,13 +204,22 @@ __global__ void half_matmul_small_kernel while(x_ptr < x_ptr_end) { - #pragma unroll - for (int i = 0; i < 4; ++i) + if constexpr (odd_rank) { half x_item = *x_ptr++; half w_item = *w_ptr; w_ptr += width; r = __hfma(x_item, w_item, r); } + else + { + #pragma unroll + for (int i = 0; i < 4; ++i) + { + half x_item = *x_ptr++; + half w_item = *w_ptr; w_ptr += width; 
+ r = __hfma(x_item, w_item, r); + } + } } __shared__ half accum[MAX_DIM_SMALL / S_BLOCKSIZE][S_THREADS_X]; @@ -242,7 +251,7 @@ void half_matmul_small_cuda { bool use_half2 = !tuningParams->matmul_no_half2; - // printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); + //printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); dim3 threads ( @@ -258,11 +267,18 @@ void half_matmul_small_cuda height ); - // printf("t... %i %i %i\n", threads.x, threads.y, threads.z); - // printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); + //printf("t... %i %i %i\n", threads.x, threads.y, threads.z); + //printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); //if (!no_zero) cudaMemsetAsync(out, 0, height * width * sizeof(half)); - if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); - else half_matmul_small_kernel<<>>(x, w, out, height, dim, width, no_zero); + if (dim & 0x03) + { + half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + } + else + { + if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + else half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + } } diff --git a/exllama_ext/cuda_func/q4_attn.cu b/exllama_ext/cuda_func/q4_attn.cu index 54336962..5c6e1be4 100644 --- a/exllama_ext/cuda_func/q4_attn.cu +++ b/exllama_ext/cuda_func/q4_attn.cu @@ -87,6 +87,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, @@ -119,7 +120,7 @@ void q4_attn_cuda ((num_heads + THREADS_Z - 1) / THREADS_Z + BLOCKSIZE_Z - 1) / BLOCKSIZE_Z ); - int _rows = q_len * num_heads; + int _rows_per_batch = q_len * num_heads; CudaBuffers* buffers = get_buffers(device_index); @@ -156,8 +157,8 @@ void q4_attn_cuda // Positional embeddings q, k - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); // Update cache tensors with projected k, v @@ -177,13 +178,13 @@ void q4_attn_cuda // str_1: project q, positions q, sync q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false, str_1); - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len, str_1); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_1); cudaEventRecord(sync_1, str_1); // str_2: project k, positions k, sync q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? 
true : false, str_2); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len, str_2); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_2); cudaEventRecord(sync_2, str_2); // str_3: project v, wait for str_2, copy (k,v) to cache, sync diff --git a/exllama_ext/cuda_func/q4_attn.cuh b/exllama_ext/cuda_func/q4_attn.cuh index 7a6ae9ae..ce2e29fa 100644 --- a/exllama_ext/cuda_func/q4_attn.cuh +++ b/exllama_ext/cuda_func/q4_attn.cuh @@ -24,6 +24,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, diff --git a/exllama_ext/cuda_func/q4_matmul.cu b/exllama_ext/cuda_func/q4_matmul.cu index 04047617..fe2343f7 100644 --- a/exllama_ext/cuda_func/q4_matmul.cu +++ b/exllama_ext/cuda_func/q4_matmul.cu @@ -245,6 +245,10 @@ void q4_matmul_recons_cuda const half alpha = __float2half(1.0f); const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f); - cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width); + +// const float alpha = 1.0f; +// const float beta = no_zero ? 1.0f : 0.0f; +// cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width, +// x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width); } diff --git a/exllama_ext/cuda_func/q4_matrix.cu b/exllama_ext/cuda_func/q4_matrix.cu index e856a978..d02fa0ec 100644 --- a/exllama_ext/cuda_func/q4_matrix.cu +++ b/exllama_ext/cuda_func/q4_matrix.cu @@ -20,6 +20,7 @@ void g_q4_keep_matrix(Q4Matrix* m) void g_q4_free_matrices() { for (const auto& m : g_q4_matrices) delete m; + g_q4_matrices.clear(); } Q4Matrix::Q4Matrix diff --git a/exllama_ext/cuda_func/rope.cu b/exllama_ext/cuda_func/rope.cu index 5179553c..ef64e9da 100644 --- a/exllama_ext/cuda_func/rope.cu +++ b/exllama_ext/cuda_func/rope.cu @@ -23,13 +23,14 @@ __global__ void rope_cuda_kernel half* __restrict__ x, const half* __restrict__ sin, const half* __restrict__ cos, - int rows, + int rows_per_batch, int head_dim, int num_heads, int past_len ) { - MatrixView_half_rw x_(x, rows, head_dim); + // These heights aren't used so it's okay if they're wrong. 
+ MatrixView_half_rw x_(x, rows_per_batch, head_dim); MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim); MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim); @@ -37,7 +38,9 @@ __global__ void rope_cuda_kernel int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2; int row = blockIdx.y * THREADS_Y + threadIdx.y; - if (row >= rows) return; + if (row >= rows_per_batch) return; + int batch_offset = blockIdx.z * rows_per_batch; + int row_offset = batch_offset + row; // Get sin and cos @@ -54,14 +57,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half2 item2_l = x_.item_half2(row, column); - half2 item2_r = x_.item_half2(row, column + half_dim); + half2 item2_l = x_.item_half2(row_offset, column); + half2 item2_r = x_.item_half2(row_offset, column + half_dim); half2 item2_ls = __hmul2(item2_r, sin2_l); half2 item2_rs = __hmul2(item2_l, sin2_r); item2_l = __hfma2(item2_l, cos2_l, item2_ls); item2_r = __hfma2(item2_r, cos2_r, item2_rs); - x_.set_half2(row, column, item2_l); - x_.set_half2(row, column + half_dim, item2_r); + x_.set_half2(row_offset, column, item2_l); + x_.set_half2(row_offset, column + half_dim, item2_r); } else { @@ -73,14 +76,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half item_l = x_.item(row, column); - half item_r = x_.item(row, column + half_dim); + half item_l = x_.item(row_offset, column); + half item_r = x_.item(row_offset, column + half_dim); half item_ls = __hmul(item_r, sin_l); half item_rs = __hmul(item_l, sin_r); item_l = __hfma(item_l, cos_l, item_ls); item_r = __hfma(item_r, cos_r, item_rs); - x_.set(row, column, item_l); - x_.set(row, column + half_dim, item_r); + x_.set(row_offset, column, item_l); + x_.set(row_offset, column + half_dim, item_r); } } @@ -100,7 +103,8 @@ void rope_cuda half* x, const half* sin, const half* cos, - const int rows, + const int bsz, + const int rows_per_batch, const int head_dim, const int num_heads, const int past_len, @@ -112,10 +116,10 @@ void rope_cuda dim3 blocks ( (head_dim + THREADS_X - 1) / THREADS_X / 2 / (tuningParams->rope_no_half2 ? 
1 : 2), - (rows + THREADS_Y - 1) / THREADS_Y, - 1 + (rows_per_batch + THREADS_Y - 1) / THREADS_Y, + int(bsz) ); fp_rope_cuda_kernel kernel = rope_cuda_kernel_pick(tuningParams); - kernel<<>>(x, sin, cos, rows, head_dim, num_heads, past_len); + kernel<<>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len); } diff --git a/exllama_ext/cuda_func/rope.cuh b/exllama_ext/cuda_func/rope.cuh index 64d5b88b..a0ffd33f 100644 --- a/exllama_ext/cuda_func/rope.cuh +++ b/exllama_ext/cuda_func/rope.cuh @@ -13,6 +13,7 @@ void rope_cuda half* x, const half* sin, const half* cos, + const int bsz, const int rows, const int head_dim, const int num_heads, diff --git a/exllama_ext/exllama_ext.cpp b/exllama_ext/exllama_ext.cpp index 280cf79c..615f7f4f 100644 --- a/exllama_ext/exllama_ext.cpp +++ b/exllama_ext/exllama_ext.cpp @@ -110,6 +110,16 @@ void set_tuning_params tuningParams.concurrent_streams = concurrent_streams; } + +// Release all unmanaged objects allocated by the extension + +void cleanup() +{ + cleanup_buffers_cuda(); + g_q4_free_matrices(); +} + + // Prepare buffers for forward pass void prepare_buffers @@ -408,12 +418,12 @@ void half_matmul_cublas void q4_attn ( - torch::Tensor x, // shape == (q_len, dim) + torch::Tensor x, // shape == (bsz, q_len, dim) torch::Tensor rms_norm_weight, // shape == (x.shape[1],) == (dim,) float epsilon, - torch::Tensor query_states, // shape == (q_len, dim) - torch::Tensor key_states, // shape == (q_len, dim) - torch::Tensor value_states, // shape == (q_len, dim) + torch::Tensor query_states, // shape == (bsz, q_len, dim) + torch::Tensor key_states, // shape == (bsz, q_len, dim) + torch::Tensor value_states, // shape == (bsz, q_len, dim) uintptr_t q_proj, uintptr_t k_proj, uintptr_t v_proj, @@ -438,7 +448,8 @@ void q4_attn TORCH_CHECK_DTYPE(query_states, kHalf); TORCH_CHECK_DTYPE(key_states, kHalf); - int dim = query_states.size(1); + int bsz = query_states.size(0); + int dim = query_states.size(2); torch::Device device = x.device(); int device_index = device.index(); @@ -467,6 +478,7 @@ void q4_attn reinterpret_cast(v_proj), (half*) sin.data_ptr(), (half*) cos.data_ptr(), + bsz, q_len, dim, head_dim, @@ -638,7 +650,8 @@ void rope_ TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim"); TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim"); - int rows = x.numel() / head_dim; + int bsz = x.size(0); + int rows_per_batch = x.numel() / head_dim / bsz; const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); @@ -648,7 +661,8 @@ void rope_ (half*) x.data_ptr(), (half*) sin.data_ptr(), (half*) cos.data_ptr(), - rows, + bsz, + rows_per_batch, head_dim, num_heads, past_len @@ -672,6 +686,8 @@ void rep_penalty int vocab_size = rep_mask.size(0); int seq_len = sequence.size(-1); + // TODO: Support batch size + rep_penalty_cpu ( vocab_size, @@ -684,10 +700,43 @@ void rep_penalty ); } +void apply_rep_penalty +( + torch::Tensor sequence, + float penalty_max, + int sustain, + int decay, + torch::Tensor logits +) +{ + TORCH_CHECK_DTYPE(sequence, kLong); + TORCH_CHECK_DTYPE(logits, kFloat); + TORCH_CHECK_SHAPES(sequence, 0, logits, 0, 1); + + int vocab_size = logits.size(-1); + int bsz = sequence.size(0); + int seq_len = sequence.size(-1); + + for (int i = 0; i < bsz; i++) + { + apply_rep_penalty_cpu + ( + vocab_size, + ((uint64_t*) sequence.data_ptr()) + i * seq_len, + penalty_max, + sustain, + decay, + seq_len, + ((float*) logits.data_ptr()) + i * vocab_size + ); + } +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 
m.def("set_tuning_params", &set_tuning_params, "set_tuning_params"); m.def("prepare_buffers", &prepare_buffers, "prepare_buffers"); + m.def("cleanup", &cleanup, "cleanup"); m.def("make_q4", &make_q4, "make_q4"); m.def("q4_matmul", &q4_matmul, "q4_matmul"); m.def("q4_matmul_lora", &q4_matmul_lora, "q4_matmul_lora"); @@ -700,5 +749,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("half_matmul", &half_matmul, "half_matmul"); m.def("half_matmul_cublas", &half_matmul_cublas, "half_matmul_cublas"); - m.def("rep_penalty", &rep_penalty, "repetition penalty mask"); + m.def("rep_penalty", &rep_penalty, "rep_penalty"); + m.def("apply_rep_penalty", &apply_rep_penalty, "apply_rep_penalty"); } diff --git a/model_init.py b/model_init.py index 7238a461..d61dbac9 100644 --- a/model_init.py +++ b/model_init.py @@ -12,6 +12,8 @@ def add_args(parser): parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. -gs 20,7,7") parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048) + parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0) + parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs") parser.add_argument("-mmrt", "--matmul_recons_thd", type = int, help = "No. rows at which to use reconstruction and cuBLAS for quant matmul. 0 = never, 1 = always", default = 8) @@ -74,6 +76,8 @@ def print_options(args, extra_options = None): print(f" -- Model config: {args.config}") print(f" -- Model: {args.model}") print(f" -- Sequence length: {args.length}") + if args.compress_pos_emb != 1.0: + print(f" -- RoPE compression factor: {args.compress_pos_emb}") print(f" -- Tuning:") print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else "")) @@ -98,6 +102,7 @@ def make_config(args): config.model_path = args.model config.max_seq_len = args.length + config.compress_pos_emb = args.compress_pos_emb config.set_auto_map(args.gpu_split) config.gpu_peer_fix = args.gpu_peer_fix diff --git a/perplexity.py b/perplexity.py index 593df6c1..84560998 100644 --- a/perplexity.py +++ b/perplexity.py @@ -32,54 +32,62 @@ def _begin(self): self.cache.current_seq_len = 0 - def _next_logits(self, input_ids, apply_lora, last_id_only=True): - n_logits = None + def _next_logits(self, input_ids, apply_lora, last_id_only = True): + n_logits = [] a = 0 while a < input_ids.shape[-1]: - b = min(input_ids.shape[-1], a + 2048) - n_logits = self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora) + b = min(input_ids.shape[-1], a + 2048) # TODO: Should this be a config parameter? + n_logits.append(self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora)) a = b - return n_logits + return torch.cat(n_logits, dim = 1) def _tokenize(self, text): return self.tokenizer.encode(text) - # This loads *and* tokenizes into chunks - def load(self, dataset_path, context=2048, overlap=0, minlength = 0): + # Load raw dataset from a text file and tokenize into chunks. 
Each chunk can optionally truncated to allow for + # evaluating the same data at different sequence lengths + + def load(self, dataset_path, chunk_size, chunk_truncate = None, overlap = 0, minlength = 0, json_key = "text"): + file_extension = os.path.splitext(dataset_path)[1] - # JSON format + # JSON format: Returned chunks may be of variable length, with each chunk representing one list item + if file_extension == '.jsonl' or file_extension == '.json': with open(dataset_path) as f: for line in f: - example = json.loads(line)["text"] + example = json.loads(line)[json_key] if len(example) > minlength: chunk = self._tokenize(example) - chunk = chunk[:, :context + 1] + chunk = chunk[:, :chunk_size] + if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] self.dataset_chunks.append(chunk) - # Raw Text + + # Raw Text: Returned chunks are fixed length windows of the entire tokenized dataset + else: - with open(dataset_path) as f: + with open(dataset_path, encoding="utf-8") as f: text = f.read() tokens = self._tokenize(text) # overlap shouldn't be bigger than the context, also need at least one token for predicting last... - if overlap >= context: - overlap = context-2 + if overlap >= chunk_size: + overlap = chunk_size-2 # We can't use torch.chunks since it want's to split things into equal sized chunks. Instead, let's do our own chunking start = 0 while start < tokens.size(1): - chunk = tokens[:, start:start+context] - start += context - overlap + chunk = tokens[:, start:start + chunk_size] + start += chunk_size - overlap + if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] self.dataset_chunks.append(chunk) - def test(self, chunk_limit=sys.maxsize, lora = None, tag="", ppl_token = False): + def test(self, chunk_limit = sys.maxsize, lora = None, tag = "", ppl_token = False): if not self.dataset_chunks: sys.exit(" xx ERROR: Empty dataset!") @@ -126,3 +134,45 @@ def test(self, chunk_limit=sys.maxsize, lora = None, tag="", ppl_token = False): print("") print(f" ** Perplexity{tag}: {perplexity:.4f}") + + +def add_args(parser): + + parser.add_argument("-ppl", "--perplexity", nargs = '?', const = 'default', metavar = "METHOD", help = "Perplexity benchmark. 
Optionally specify method: gptq-for-llama, llama.cpp (not yet implemented)") + parser.add_argument("-ppl_ds", "--perplexity_dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") + parser.add_argument("-ppl_cn", "--perplexity_chunk_num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark", default = 100) + parser.add_argument("-ppl_cs", "--perplexity_chunk_size", type = int, help = "Size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl_ct", "--perplexity_chunk_truncate", type = int, help = "Truncated size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl_co", "--perplexity_chunk_overlap", type = int, help = "Chunk overlap", default = 0) + parser.add_argument("-ppl_cm", "--perplexity_chunk_min", type = int, help = "Minimum chunk length", default = 50) + parser.add_argument("-ppl_key", "--perplexity_json_key", type = str, help = "Key to extract from JSON dataset, default: 'text'", default = "text") + parser.add_argument("-ppl_t", "--perplexity_token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") + + +def post_parse(args): + + if not args.perplexity: return + + # GPTQ-for-LLaMa equivalent + + if args.perplexity == "gptq-for-llama": + args.perplexity_dataset = "datasets/wikitext2.txt" + args.perplexity_chunk_num = 128 + args.perplexity_chunk_size = 2048 + args.perplexity_chunk_truncate = 2048 + args.perplexity_chunk_overlap = 0 + args.perplexity_chunk_min = 0 + + # Default dataset for legacy method + + if args.perplexity_dataset is None: args.perplexity_dataset = "datasets/wikitext2_val_sample.jsonl" + + print(f" -- Perplexity:") + print(f" -- - Dataset: {args.perplexity_dataset}") + print(f" -- - Chunks: {args.perplexity_chunk_num}") + print(f" -- - Chunk size: {args.perplexity_chunk_size}" + (f" -> {args.perplexity_chunk_truncate}" if args.perplexity_chunk_truncate is not None else "")) + print(f" -- - Chunk overlap: {args.perplexity_chunk_overlap}") + print(f" -- - Min. 
chunk size: {args.perplexity_chunk_min}") + print(f" -- - Key: {args.perplexity_json_key}") + if args.perplexity_token: print("f -- - Per-token mode") + diff --git a/requirements-web.txt b/requirements-web.txt new file mode 100644 index 00000000..d1bf2563 --- /dev/null +++ b/requirements-web.txt @@ -0,0 +1,2 @@ +flask==2.3.2 +waitress==2.1.2 diff --git a/setup.py b/setup.py index 94cc8596..13e3fa76 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name="exllama", - version="0.0.4", + version="0.0.5", install_requires=[ "torch", ], diff --git a/sh/test_chatbot.sh b/sh/test_chatbot.sh deleted file mode 100755 index 2595c58d..00000000 --- a/sh/test_chatbot.sh +++ /dev/null @@ -1,18 +0,0 @@ - -python test_chatbot.py -d /mnt/str/models/wizardlm-30b-uncensored-4bit-act-order/ -un "Maxine" -p prompt_assistant.txt -nnl -temp 1.0 -topp .75 - -#python test_chatbot.py \ -#-t /mnt/str/models/bluemoon-4k-13b-4bit-128g/tokenizer.model \ -#-c /mnt/str/models/bluemoon-4k-13b-4bit-128g/config.json \ -#-m /mnt/str/models/bluemoon-4k-13b-4bit-128g/bluemoonrp-13b-4k-epoch6-4bit-128g.safetensors \ -#-p prompt_bluemoon.txt \ -#-un "Player" \ -#-bn "DM" \ -#-bf \ -#-topk 30 \ -#-topp 0.45 \ -#-minp 0.1 \ -#-temp 1.4 \ -#-repp 1.3 \ -#-repps 256 \ -#-l 4096 \ No newline at end of file diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py new file mode 100644 index 00000000..0c438631 --- /dev/null +++ b/test_benchmark_inference.py @@ -0,0 +1,302 @@ +from model import ExLlama, ExLlamaCache, ExLlamaConfig +from tokenizer import ExLlamaTokenizer +from generator import ExLlamaGenerator +from lora import ExLlamaLora +import perplexity +from perplexity import Perplexity +import time +import torch +import torch.nn.functional as F +import argparse +import json +import math +import sys +import os +import glob +import model_init + +torch.cuda._lazy_init() +# torch.backends.cuda.matmul.allow_tf32 = True +# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True +torch.set_printoptions(precision = 10) +torch_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] + +cache = None +model = None + +def begin(): + global model, cache + + if cache is None: cache = ExLlamaCache(model) + else: cache.current_seq_len = 0 + + +def next_logits(input_ids, apply_lora, last_id_only = True, input_mask = None): + global model, cache + + n_logits = None + a = 0 + while a < input_ids.shape[-1]: + b = min(input_ids.shape[-1], a + 2048) + n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora, input_mask = input_mask) + a = b + + return n_logits + + +def tokenize(text): + global tokenizer + + return tokenizer.encode(text) + + +def timer(name, func): + t = time.time() + ret = func() + t = time.time() - t + print(f" ** Time, {name}: {t:.2f} seconds") + return ret + + +mem_base = {} +mem_last = {} +for dev in torch_devices: + torch.cuda.reset_peak_memory_stats(dev) + mem_base[dev] = mem_last[dev] = torch.cuda.max_memory_allocated(dev) + +def mem(name, total = False): + global mem_base, mem_last + + res = f" ** VRAM, {name}: " + first = True + + for device in torch_devices: + mem_c = torch.cuda.max_memory_allocated(device) + mem_this = mem_c - mem_last[device] if not total else mem_c - mem_base[device] + mem_last[device] = mem_c + + if not first: res += " - " + first = False + res += f"[{device}] {mem_this / (1024 ** 2):,.2f} MB" + + print(res) + + +# Parse arguments + +parser = argparse.ArgumentParser(description = "Benchmark tests for ExLlama") + 
+model_init.add_args(parser) +perplexity.add_args(parser) + +parser.add_argument("-p", "--perf", action = "store_true", help = "Benchmark speed and VRAM usage") +parser.add_argument("-v", "--validate", action = "count", help = "Run validation check and generate some sample output; specify twice for a more thorough test") +parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") +parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") +parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. to use during benchmark") + +args = parser.parse_args() + +model_init.post_parse(args) +perplexity.post_parse(args) +model_init.get_model_files(args) + +# Paths + +if args.lora_dir is not None: + args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") + args.lora = os.path.join(args.lora_dir, "adapter_model.bin") + +# Feedback + +print_opts = [] +if args.perf: print_opts.append("perf") +if args.validate: print_opts.append("validate") +if args.perplexity: print_opts.append("perplexity") +if args.perplexity_token: print_opts.append("perplexity_token") + +model_init.print_options(args, print_opts) + +# Instantiate model + +config = model_init.make_config(args) + +model = timer("Load model", lambda: ExLlama(config)) +tokenizer = timer("Load tokenizer", lambda: ExLlamaTokenizer(args.tokenizer)) + +model_init.print_stats(model) + +torch.cuda.reset_peak_memory_stats("cuda") +mem("Model") + +# Load LoRA + +lora = None +if args.lora: + print(f" -- LoRA config: {args.lora_config}") + print(f" -- Loading LoRA: {args.lora}") + if args.lora_config is None: + print(f" ## Error: please specify lora path to adapter_config.json") + sys.exit() + lora = ExLlamaLora(model, args.lora_config, args.lora) + if lora.bias_ignored: + print(f" !! 
Warning: LoRA zero bias ignored") + +# Test sequence + +gen_tokens = 128 +max_seq_len = args.length +ids = torch.randint(0, 31999, (1, max_seq_len - gen_tokens)).cuda() + +# Benchmark memory and performance + +if args.perf: + + # Warming up apparently makes a huge difference + + for i in range(1, 3): + print(f" -- Warmup pass {i}...") + begin() + logits = timer("Warmup", lambda: next_logits(ids, lora)) + + # Do the actual benchmark + + begin() + + t = time.time() + + print(" -- Inference, first pass.") + logits = timer("Inference", lambda: next_logits(ids, lora)) + + t = time.time() - t + print(f" ** Speed: {ids.shape[-1] / t:.2f} tokens/second") + + for j in range(2): + + t = time.time() + print(f" -- Generating {gen_tokens} tokens, {ids.shape[-1]} token prompt...") + for i in range(gen_tokens): + + logits = logits[0, -1, :] + token = torch.argmax(logits) + next_id = token.unsqueeze(0).unsqueeze(0) + logits = next_logits(next_id, lora) + + t = time.time() - t + print(f" ** Speed: {gen_tokens / t:.2f} tokens/second") + + ids = ids[:, :4] + cache.current_seq_len = 4 + + mem("Inference") + mem("Total", total = True) + + +# Benchmark perplexity + +if args.perplexity: + + ppl = Perplexity(args.perplexity, model, cache, tokenizer) + + print(" -- Loading dataset...") + + ppl.load(dataset_path = args.perplexity_dataset, + chunk_size = args.perplexity_chunk_size, + chunk_truncate = args.perplexity_chunk_truncate, + overlap = args.perplexity_chunk_overlap, + minlength = args.perplexity_chunk_min, + json_key = args.perplexity_json_key) + + begin() + + ppl.test(args.perplexity_chunk_num, + lora = lora, + ppl_token = args.perplexity_token) + +# Validate file + +if args.validate: + + ppl = Perplexity(args.perplexity, model, cache, tokenizer) + + ppl.load(dataset_path = "datasets/wikitext2_val_sample.jsonl", + chunk_size = 2048, + chunk_truncate = 2048, + overlap = 0, + minlength = 50, + json_key = "text") + + # Short perplexity tests in switched and quant mode, should produce roughly equal results + + begin() + + model.config.matmul_recons_thd = 1 + ppl.test(8, lora = lora, tag = " (reconstruct)") + model.config.matmul_recons_thd = 0 + ppl.test(8, lora = lora, tag = " (quant, token)", ppl_token = True) + + # Do a short, easy topk=1 completion to see if we're generating garbage. 
Should run in switched mode + # for the prompt and quant for individual tokens + + model.config.matmul_recons_thd = 4 + generator = ExLlamaGenerator(model, tokenizer, cache) + generator.settings.top_k = 1 + generator.lora = lora + text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) + print(f" ** Generation: {repr(text)}") + + if args.validate > 1: + + # Test batched generation + + bsz = 8 + gen_len = 20 + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + + # Bigger cache for the batch + + del cache + cache = ExLlamaCache(model, batch_size = bsz) + + # Create tokenized batch and attention mask + + identical_batch_prompt = "When you have eliminated the impossible, whatever remains," + continuations = [ + " must be considered", + " ought to be", + " (and some scholars say this is", + " however improbable, is a banana.", + ] + + prompts = [identical_batch_prompt] * (bsz - len(continuations)) + for cont in continuations: + prompts.append(identical_batch_prompt + cont) + + ids = tokenizer.encode(prompts) + assert ids.shape[1] < model.config.max_seq_len, f"Max length {ids.shape[1]} exceeds model limit {model.config.max_seq_len}" + + mask = ids.ne(tokenizer.pad_token_id) + + # Batched generation with greedy sampling + + sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") + logits = next_logits(ids, lora, input_mask = mask) + + for i in range(gen_len): + logits = logits[:, -1, :] + id_per_batch = torch.argmax(logits, dim=-1) + assert id_per_batch.shape == (bsz,), f"{id_per_batch.shape} != {(bsz,)}" + next_id_per_batch = id_per_batch.unsqueeze(-1) + sequence = torch.cat((sequence, next_id_per_batch), dim = -1) + logits = next_logits(next_id_per_batch, lora) + + # Print output batch + + print(f"\n ** Batching sanity check: 1-{bsz - len(continuations)} should be identical. All should be reasonable for the model you're using.\n") + + outputs = tokenizer.decode(sequence) + for b in range(bsz): + print(f"{b + 1} {repr(prompts[b])} -> {repr(outputs[b])}") + + # TODO Save the logits and then rerun each prompt with a batch size of 1, same input. The logits should be identical. 
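
For reference, a minimal sketch of how the new batch-aware `apply_rep_penalty` binding added in `exllama_ext.cpp` might be driven from Python. The import name `exllama_ext`, the tensor contents and the parameter values are assumptions for illustration only; the signature and dtype/shape expectations come from the pybind11 definition in this patch (a `(bsz, seq_len)` long tensor of token ids and a `(bsz, vocab_size)` float logits tensor modified in place).

```python
# Illustrative sketch only: the module name `exllama_ext` and the values below
# are assumptions. The call follows the binding added in this patch:
# apply_rep_penalty(sequence, penalty_max, sustain, decay, logits).
import torch
import exllama_ext

bsz, seq_len, vocab_size = 2, 128, 32000
sequence = torch.randint(0, vocab_size, (bsz, seq_len), dtype=torch.long)  # per-row token history
logits = torch.randn(bsz, vocab_size, dtype=torch.float)                   # per-row next-token logits

# Applied in place, one batch row at a time: full penalty over the most recent
# `sustain` tokens of each row, fading out across the preceding `decay` tokens.
exllama_ext.apply_rep_penalty(sequence, 1.15, 256, 128, logits)
```
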
diff --git a/webui/app.py b/webui/app.py index f921cf33..c2fa909f 100644 --- a/webui/app.py +++ b/webui/app.py @@ -1,6 +1,7 @@ import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from model import ExLlama, ExLlamaConfig from flask import Flask, render_template, request, jsonify from flask import Response, stream_with_context from threading import Timer, Lock @@ -10,7 +11,7 @@ from session import prepare_sessions, get_initial_session, Session, load_session, new_session, _sessions_dir import argparse from tokenizer import ExLlamaTokenizer -from model import ExLlama, ExLlamaConfig +from waitress import serve app = Flask(__name__) app.static_folder = 'static' @@ -117,12 +118,17 @@ def api_userinput(): result = Response(stream_with_context(session.respond_multi(user_input)), mimetype = 'application/json') return result +@app.route("/api/append_block", methods=['POST']) +def api_append_block(): + data = request.get_json() + session.api_append_block(data) + return json.dumps({"result": "ok"}) + "\n" # Load the model parser = argparse.ArgumentParser(description="Simple web-based chatbot for ExLlama") parser.add_argument("-host", "--host", type = str, help = "IP:PORT eg, 0.0.0.0:7862", default = "localhost:5000") -parser.add_argument("-sd", "--sessions-dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") +parser.add_argument("-sd", "--sessions_dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") model_init.add_args(parser) args = parser.parse_args() @@ -155,4 +161,4 @@ def api_userinput(): if host == "localhost": Timer(1, lambda: webbrowser.open(f'http://{machine}/')).start() -app.run(host = host, port = port) \ No newline at end of file +serve(app, host = host, port = port) \ No newline at end of file diff --git a/webui/session.py b/webui/session.py index 4b07993f..5a715113 100644 --- a/webui/session.py +++ b/webui/session.py @@ -282,22 +282,23 @@ def api_populate(self): jnode["author_idx"] = self.participants.index(author) dic = {"sessions": names, - "current_session": name, - "fixed_prompt": self.fixed_prompt.text, - "keep_fixed_prompt": self.keep_fixed_prompt, - "participants": self.participants, - "history": historyjson, - "temperature": generator.settings.temperature, - "top_p": generator.settings.top_p, - "min_p": generator.settings.min_p, - "top_k": generator.settings.top_k, - "typical": generator.settings.typical, - "break_on_newline": self.break_on_newline, - "max_response_tokens": self.max_response_tokens, - "chunk_size": self.chunk_size, - "token_repetition_penalty_max": generator.settings.token_repetition_penalty_max, - "token_repetition_penalty_sustain": generator.settings.token_repetition_penalty_sustain, - "token_repetition_penalty_decay": generator.settings.token_repetition_penalty_decay} + "current_session": name, + "fixed_prompt": self.fixed_prompt.text, + "keep_fixed_prompt": self.keep_fixed_prompt, + "participants": self.participants, + "history": historyjson, + "temperature": generator.settings.temperature, + "top_p": generator.settings.top_p, + "min_p": generator.settings.min_p, + "top_k": generator.settings.top_k, + "typical": generator.settings.typical, + "break_on_newline": self.break_on_newline, + "max_response_tokens": self.max_response_tokens, + "chunk_size": self.chunk_size, + "token_repetition_penalty_max": generator.settings.token_repetition_penalty_max, + 
"token_repetition_penalty_sustain": generator.settings.token_repetition_penalty_sustain, + "token_repetition_penalty_decay": generator.settings.token_repetition_penalty_decay, + "max_seq_len": model.config.max_seq_len} # Add model info @@ -336,6 +337,22 @@ def api_edit_block(self, data): return + def api_append_block(self, data): + + author = None + if "author" in data: + author = data["author"] + else: + if len(self.participants) > 0: + author = self.participants[0] + + text = data["text"].strip() + + newNode = Node(text, author) + self.history.append(newNode) + self.save() + + def api_set_participants(self, data): self.participants = data["participants"] @@ -599,7 +616,8 @@ def respond_multi(self, user_input): end_time = time.time() elapsed = end_time - begin_time new_tokens = context.shape[-1] - reused - print(f"Prompt processed in {elapsed:.2f} seconds, {new_tokens} new tokens, {(new_tokens / elapsed):.2f} tokens/second:") + token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) + print(f"Prompt processed in {elapsed:.2f} seconds, {new_tokens} new tokens, {token_rate:.2f} tokens/second:") begin_time = time.time() total_tokens = [0] @@ -676,8 +694,9 @@ def respond_multi(self, user_input): end_time = time.time() elapsed = end_time - begin_time + token_rate = 0 if elapsed == 0 else (total_tokens[0] / elapsed) - print(f"Response generated in {elapsed:.2} seconds, {total_tokens[0]} tokens, {(total_tokens[0] / elapsed):.2f} tokens/second:") + print(f"Response generated in {elapsed:.2} seconds, {total_tokens[0]} tokens, {token_rate:.2f} tokens/second:") self.save() diff --git a/webui/static/main.js b/webui/static/main.js index 9ec07785..09cf1ce1 100644 --- a/webui/static/main.js +++ b/webui/static/main.js @@ -65,13 +65,15 @@ function sendGenSettings() { json.token_repetition_penalty_sustain = getTBNumber("sl_repp_sustain_tb"); json.token_repetition_penalty_decay = getTBNumber("sl_repp_decay_tb"); - console.log(json); + // console.log(json); send("/api/set_gen_settings", json); } -function setSlider(id, value) { +function setSlider(id, value, override_max = null) { let slider = document.getElementById(id); + if (override_max) slider.max = override_max; + let tb = document.getElementById(id + "_tb"); let decimals = slider.dataset.decimals; let mult = Math.pow(10, decimals); @@ -256,6 +258,8 @@ function populate() { let tf_model_info = document.getElementById("tf_model_info") tf_model_info.value = data.model_info; + let model_max_seq_lan = data.max_seq_len; + // Fixed prompt let tf_fixed_prompt = document.getElementById("tf_fixed_prompt") @@ -285,7 +289,7 @@ function populate() { // Repetition penalty setSlider("sl_repp_penalty", data.token_repetition_penalty_max); - setSlider("sl_repp_sustain", data.token_repetition_penalty_sustain); + setSlider("sl_repp_sustain", data.token_repetition_penalty_sustain, model_max_seq_lan); setSlider("sl_repp_decay", data.token_repetition_penalty_decay); // Participants