diff --git a/.env b/.env index 0b6f65f3..3ab8b5a3 100644 --- a/.env +++ b/.env @@ -1,4 +1,5 @@ PORT=5000 +RUN_UID=1000 # set to 0 to run the service as root inside the container +APPLICATION_STATE_PATH=/data # path to the directory holding application state inside the container MODEL_PATH=models/LLaMA-7B-4bit-128g # replace with the actual model path on the host -CONTAINER_MODEL_PATH=/app/model -SESSIONS_PATH=./exllama_sessions +SESSIONS_PATH=~/exllama_sessions # replace with the actual directory on the host where chat sessions should be stored diff --git a/Dockerfile b/Dockerfile index b7dd15ce..829e9dad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,31 @@ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as build - -ENV RUN_UID=1000 +ARG RUN_UID="1000" \ + APPLICATION_STATE_PATH="/data" +ENV RUN_UID=$RUN_UID \ + APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH \ + CONTAINER_MODEL_PATH=$APPLICATION_STATE_PATH/model \ + CONTAINER_SESSIONS_PATH=$APPLICATION_STATE_PATH/exllama_sessions RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y ninja-build python3 python3-pip && \ rm -rf /var/lib/apt/lists/* -# Setup user which will run the service -RUN useradd -m -u $RUN_UID user -USER user +# Setup user which will run the service and create application state directory +RUN if [ ${RUN_UID} -ne 0 ] ; then useradd -m -u $RUN_UID user ; fi \ + && mkdir -p $APPLICATION_STATE_PATH \ + && mkdir -p $CONTAINER_MODEL_PATH \ + && mkdir -p $CONTAINER_SESSIONS_PATH \ + && chown -R $RUN_UID $APPLICATION_STATE_PATH +USER $RUN_UID -COPY --chown=user . /app +COPY --chown=$RUN_UID . /app WORKDIR /app +# Create application state directory and install python packages RUN pip install --upgrade pip setuptools wheel \ && pip install -r requirements.txt \ - && pip install flask==2.3.2 + && pip install -r requirements-web.txt USER root diff --git a/README.md b/README.md index d73f5b65..e5b0074d 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,16 @@ incompatibilities with older cards. ## Dependencies +* Python 3.9 or newer * `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118 * `safetensors` 0.3.1 * `sentencepiece` * `ninja` -* `flask` (only for the web UI) + +Additionally, only for the web UI: + +* `flask` +* `waitress` ## Linux/WSL prerequisites @@ -30,7 +35,7 @@ To run on Windows (without WSL): Studio 2022` IDE, or alternatively just the `Build Tools for Visual Studio 2022` package (make sure `Desktop development with C++` is ticked in the installer), it doesn't really matter which. 2. Install the appropriate version of [PyTorch](https://pytorch.org/get-started/locally/), choosing one of the CUDA -versions. I am developing on the nightly build, but the stable version should also work. +versions. I am developing on the nightly build, but the stable version (2.0.1) should also work. 3. Install CUDA Toolkit, ([11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and [11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) both seem to work, just make sure to match PyTorch's Compute Platform version). @@ -40,7 +45,7 @@ Compute Platform version). Install dependencies, clone repo and run benchmark: - pip install safetensors sentencepiece ninja + pip install -r requirements.txt git clone https://github.com/turboderp/exllama cd exllama @@ -65,11 +70,12 @@ multibot mode: To run it: - pip install flask + pip install -r requirements-web.txt python webui/app.py -d -Note that sessions are stored in `~/exllama_sessions/`. +Note that sessions are stored in `~/exllama_sessions/`. 
You can change the location of the sessions storage with `-sd` +if you want. ## Docker For security benefits and easier deployment, it is also possible to run the web UI in an isolated docker container. Note: the docker image currently only supports NVIDIA GPUs. @@ -91,7 +97,13 @@ docker compose build It is also possible to manually build the image: ``` -docker build -t exllama-web +docker build -t exllama-web . +``` + +NOTE: by default, the service inside the docker container is run by a non-root user. Hence, the ownership of bind-mounted directories (`/data/model` and `/data/exllama_sessions` in the default `docker-compose.yml` file) is changed to this non-root user in the container entrypoint (`entrypoint.sh`). To disable this, set `RUN_UID=0` in the `.env` file if using `docker compose`, or the following command if you manually build the image: + +``` +docker build -t exllama-web --build-arg RUN_UID=0 . ``` ### Run @@ -109,7 +121,7 @@ The configuration can be viewed in `docker-compose.yml` and changed by creating Run manually: ``` -docker run --gpus all -p 5000:5000 -v :/app/model/ --rm -it exllama-web --host 0.0.0.0:5000 +docker run --gpus all -p 5000:5000 -v :/data/model/ -v :/data/exllama_sessions --rm -it exllama-web --host 0.0.0.0:5000 ``` @@ -179,20 +191,6 @@ confirmed to be working right now. ## Recent updates -**2023-05-24**: Added fused rotary embeddings and some minor optimizations. 13% faster on 7B, 9% on 13B. Small -improvement on larger models. Added best-case scores to benchmark results and some clarification. For easier -comparisons to other implementations, or whatever. - -**2023-05-27**: Better memory management in CUDA. Introduced auto switch between Torch's SDP backend and regular -matmul attention with some tweaks. Finished CUDA MLP. All in all about 10% faster with these updates. - -**2023-05-29**: Web UI is _almost_ up and running. Having to learn JavaScript, and it turns out I hate JavaScript. But -ChatGPT is an incredible resource for learning new languages, I gotta say, so it's not as painful as it could have -been. Anyway, in the process of working with the UI I discovered I've been measuring prompt speed incorrectly. Either -Torch or CUDA or the GPU driver does some sort of caching or self-calibration or lazy initialization during the first -pass through the model, so subsequent passes are actually _way_ faster than what I've been recording. Doesn't do much -for individual tokens, but benchmarks updated anyway. Closing in on 10k tokens/second for 7B. (!) - **2023-06-02**: Web UI is now in a fairly working state. Expect it to be a little scuffed in places. There will be a rewrite at some point to make the client-side code less seizure-inducing. It has multibot mode, chat rewind and editing features, sessions, and more. I'm going to build it out with support for instruct prompting and such, in time. @@ -216,5 +214,5 @@ disabled by default. YMMV. Use `-cs` to try it out. **2023-06-17**: Fixed a nasty bug in the fused attention that was causing slightly incorrect cache states on 13B and 33B models. You definitely want to update. -**2023-06-18**: LoRA support now. Still needs a lot of testing and som optimization, and currently you can't stack -multiple LoRAs during the same inference. There's also no support in the web UI yet. \ No newline at end of file +**2023-06-18**: LoRA support now. Still needs a lot of testing and some optimization, and currently you can't stack +multiple LoRAs during the same inference. There's also no support in the web UI yet. 
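For reference, the Docker workflow described in the README changes above can be tied together with a single `.env` file. The sketch below is illustrative only: the host paths are placeholders, the other values simply restate the defaults introduced in the `.env` diff, and `docker compose up` is assumed to be the usual way to start the service described under the Run section.

```
# .env — example configuration for docker compose (host paths are placeholders)
PORT=5000
RUN_UID=1000                             # set to 0 to run the service as root inside the container
APPLICATION_STATE_PATH=/data             # application state directory inside the container
MODEL_PATH=/path/to/LLaMA-7B-4bit-128g   # model directory on the host
SESSIONS_PATH=~/exllama_sessions         # chat session storage on the host

# Build and start the web UI
docker compose build
docker compose up
```

With `RUN_UID=0`, `entrypoint.sh` skips the `chown` of the bind-mounted state directory, matching the README note about running the service as root.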
diff --git a/cuda_ext.py b/cuda_ext.py index 91b4397b..5efd3d4a 100644 --- a/cuda_ext.py +++ b/cuda_ext.py @@ -30,14 +30,15 @@ def find_msvc(): import subprocess try: - subprocess.check_output(["where", "cl"]) + subprocess.check_output(["where", "/Q", "cl"]) except subprocess.CalledProcessError as e: cl_path = find_msvc() if cl_path: - print("Injected compiler path:", cl_path) + if verbose: + print("Injected compiler path:", cl_path) os.environ["path"] += ";" + cl_path else: - print("Unable to find cl.exe; compilation will probably fail.") + print("Unable to find cl.exe; compilation will probably fail.", file=sys.stderr) exllama_ext = load( name = extension_name, @@ -56,7 +57,7 @@ def find_msvc(): ], extra_include_paths = [os.path.join(library_dir, "exllama_ext")], verbose = verbose, - extra_ldflags = ["cublas.lib"] if windows else [], + extra_ldflags = (["cublas.lib"] + ([f"/LIBPATH:{os.path.join(sys.base_prefix, 'libs')}"] if sys.base_prefix != sys.prefix else [])) if windows else [], extra_cuda_cflags = ["-lineinfo"] + (["-U__HIP_NO_HALF_CONVERSIONS__", "-O3"] if torch.version.hip else []), extra_cflags = ["-O3"] # extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"] @@ -73,6 +74,7 @@ def find_msvc(): from exllama_ext import rms_norm from exllama_ext import rope_ from exllama_ext import rep_penalty +from exllama_ext import apply_rep_penalty # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension @@ -157,3 +159,9 @@ def ext_rep_penalty_mask_cpu(vocab_size, sequence, penalty_max, sustain, decay): rep_mask = torch.empty(vocab_size, dtype = torch.float32) rep_penalty(sequence, rep_mask, penalty_max, sustain, decay) return rep_mask + + +def ext_apply_rep_penalty_mask_cpu(sequence, penalty_max, sustain, decay, logits): + + apply_rep_penalty(sequence, penalty_max, sustain, decay, logits) + diff --git a/cuda_test/compile.sh b/cuda_test/compile.sh deleted file mode 100755 index 3f04d83b..00000000 --- a/cuda_test/compile.sh +++ /dev/null @@ -1,7 +0,0 @@ -/opt/cuda/bin/nvcc -isystem /opt/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 --compiler-options '-fPIC' -std=c++17 \ -test.cu \ -../exllama_ext/q4v2_mlp.cu \ -../exllama_ext/rms_norm.cu \ -../exllama_ext/q4v2_matmul.cu \ -../exllama_ext/column_remap.cu \ --o ./test diff --git a/cuda_test/test.cu b/cuda_test/test.cu deleted file mode 100644 index 47a2e69b..00000000 --- a/cuda_test/test.cu +++ /dev/null @@ -1,339 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../exllama_ext/util.h" -#include "../exllama_ext/matrix.h" -#include "../exllama_ext/q4v2_mlp.h" - -using namespace std; - -template -class Tensor -{ -public: - T* data_cuda; - T* data_cpu; - uint32_t height; - uint32_t width; - - // From file - - Tensor(const char* filename) - { - FILE* file = fopen(filename, "rb"); - if (!file) - { - cout << "File not found: " << filename << "\n"; - return; - } - - fseek(file, 0, SEEK_END); - long size = ftell(file); - fseek(file, 0, SEEK_SET); - size_t num_elements = size / sizeof(T); - - if (num_elements == 0) - { - data_cuda = NULL; - data_cpu = NULL; - height = 0; - width = 0; - - cout << " ** " << filename << " (None)\n"; - return; - } - - data_cpu = new T[num_elements]; - 
fread(data_cpu, sizeof(T), num_elements, file); - fclose(file); - - char filenameshape[1024]; - strcpy(filenameshape, filename); - strcat(filenameshape, ".shape"); - - file = fopen(filenameshape, "rb"); - if (!file) - { - cout << "File not found: " << filenameshape << "\n"; - return; - } - - fread(&height, 1, sizeof(uint32_t), file); - fread(&width, 1, sizeof(uint32_t), file); - fclose(file); - - if (width * height != num_elements) - { - cout << "Incorrect shape: " << filenameshape << "\n"; - return; - } - - cudaMalloc(&data_cuda, size); - dataToCUDA(); - - cout << " ** " << filename << " (" << height << ", " << width << ")\n"; - } - - // Empty tensor - - Tensor(int _height, int _width) - { - height = _height; - width = _width; - - size_t size = (height * width) * sizeof(T); - cudaMalloc(&data_cuda, size); - - data_cpu = new T[height * width]; - } - - // Zero tensor - - Tensor(int _height, int _width, T zero_value) - { - height = _height; - width = _width; - - size_t size = (height * width) * sizeof(T); - cudaMalloc(&data_cuda, size); - - data_cpu = new T[height * width]; - - for (int i = 0; i < _width * _height; i++) data_cpu[i] = zero_value; - dataToCUDA(); - } - - // Fill - - void fill(T value) - { - for (int i = 0; i < width * height; i++) data_cpu[i] = value; - dataToCUDA(); - } - - // Copy data - - void dataToCUDA() - { - size_t size = (height * width) * sizeof(T); - cudaMemcpy(data_cuda, data_cpu, size, cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); - } - - void dataToCPU() - { - size_t size = (height * width) * sizeof(T); - cudaMemcpy(data_cpu, data_cuda, size, cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - } - -}; - -__global__ void dummyKernel() -{ - // Dummy kernel -} - -void warmUpCUDA() -{ - // Create a CUDA context - cudaFree(0); - - // Launch a dummy kernel - dummyKernel<<<1, 1>>>(); - cudaDeviceSynchronize(); -} - - -template -float compareTensors(Tensor& a, Tensor& b, int height = 0, int width = 0) -{ - if (height == 0 && (a.width != b.width || a.height != b.height)) - { - cout << "Incompatible sizes.\n"; - return std::numeric_limits::infinity(); - } - - if (height == 0) height = a.height; - if (width == 0) width = a.width; - - a.dataToCPU(); - b.dataToCPU(); - - float m = 0.0f; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - float a_f = __half2float(a.data_cpu[r * a.width + c]); - float b_f = __half2float(b.data_cpu[r * b.width + c]); - m = fmax(m, fabs(a_f - b_f)); - } - } - - return m; -} - -void printTensor(Tensor& a) -{ - int width = 8; if (width > a.width) width = a.width; - int height = 8; if (height > a.height) height = a.height; - - a.dataToCPU(); - - for (int c = 0; c < width; c++) cout << "---------"; - cout << "\n"; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - float a_f = __half2float(a.data_cpu[r * a.width + c]); - cout << setfill(' ') << setprecision(5) << setw(9) << a_f << dec; - } - cout << "\n"; - } -} - -void printTensor(Tensor& a) -{ - int width = 8; if (width > a.width) width = a.width; - int height = 8; if (height > a.height) height = a.height; - - a.dataToCPU(); - - for (int c = 0; c < width; c++) cout << "---------"; - cout << "\n"; - - for (int r = 0; r < height; r++) - { - for (int c = 0; c < width; c++) - { - uint32_t a_i = a.data_cpu[r * a.width + c]; - cout << " " << setfill('0') << setw(8) << hex << a_i << dec << setfill(' '); - } - cout << "\n"; - } -} - - -int main() -{ - warmUpCUDA(); - int iters; - - cout << fixed << setprecision(6); - cout << "Loading 
tensors...\n"; - - // Test MLP - - Tensor x ("mlp/test_mlp_x"); - Tensor x_gated ("mlp/test_mlp_x_gated"); - Tensor x_done ("mlp/test_mlp_x_done"); - Tensor x_prenorm ("mlp/test_mlp_x_prenorm"); - Tensor x_postresidual ("mlp/test_mlp_x_postresidual"); - - Tensor rms_norm_weight ("mlp/test_mlp_norm_weight"); - - Tensor up_proj_bias ("mlp/up_proj.bias"); - Tensor up_proj_qweight ("mlp/up_proj.qweight"); - Tensor up_proj_qzeros ("mlp/up_proj.qzeros"); - Tensor up_proj_scales ("mlp/up_proj.scales"); - Tensor up_proj_seq_g_idx ("mlp/up_proj.seq_g_idx"); - Tensor up_proj_x_map ("mlp/up_proj.x_map"); - - Tensor gate_proj_bias ("mlp/gate_proj.bias"); - Tensor gate_proj_qweight ("mlp/gate_proj.qweight"); - Tensor gate_proj_qzeros ("mlp/gate_proj.qzeros"); - Tensor gate_proj_scales ("mlp/gate_proj.scales"); - Tensor gate_proj_seq_g_idx ("mlp/gate_proj.seq_g_idx"); - Tensor gate_proj_x_map ("mlp/gate_proj.x_map"); - - Tensor down_proj_bias ("mlp/down_proj.bias"); - Tensor down_proj_qweight ("mlp/down_proj.qweight"); - Tensor down_proj_qzeros ("mlp/down_proj.qzeros"); - Tensor down_proj_scales ("mlp/down_proj.scales"); - Tensor down_proj_seq_g_idx ("mlp/down_proj.seq_g_idx"); - Tensor down_proj_x_map ("mlp/down_proj.x_map"); - - Tensor x_temp(x.height, x.width); - Tensor x_col_temp(1, x.height); - Tensor x_act_temp(x.height, gate_proj_qweight.width); - - Tensor out(x_gated.height, x_gated.width); - - int groupsize = gate_proj_qweight.height * 8 / gate_proj_qzeros.height; - - iters = 1; - auto start_time = chrono::high_resolution_clock::now(); - - cout << "--------\n"; - - cout << "Fused MLP (" << iters << " iterations)... "; - - for (int i = 0; i < iters; i++) - { - q4v2_mlp_cuda - ( - x_prenorm.data_cuda, // input - - x_temp.data_cuda, // input, normalized (empty) - x_col_temp.data_cuda, // temp for norm (empty) - x_act_temp.data_cuda, // temp for act(x @ gate) * x @ up (empty) - - rms_norm_weight.data_cuda, - (1e-06), - - gate_proj_qweight.data_cuda, - gate_proj_scales.data_cuda, - gate_proj_qzeros.data_cuda, - gate_proj_seq_g_idx.data_cuda, - gate_proj_x_map.data_cuda, - - up_proj_qweight.data_cuda, - up_proj_scales.data_cuda, - up_proj_qzeros.data_cuda, - up_proj_seq_g_idx.data_cuda, - up_proj_x_map.data_cuda, - - down_proj_qweight.data_cuda, - down_proj_scales.data_cuda, - down_proj_qzeros.data_cuda, - down_proj_seq_g_idx.data_cuda, - down_proj_x_map.data_cuda, - - x.height, - x.width, - gate_proj_qweight.width, - groupsize - ); - } - - cudaDeviceSynchronize(); - - auto end_time = chrono::high_resolution_clock::now(); - auto duration = chrono::duration_cast(end_time - start_time).count(); - duration /= iters; - cout << duration << " us / iteration\n"; - - cout << "Validating fused MLP... 
"; - - float diff = compareTensors(x_prenorm, x_postresidual); - - cout << "max diff.: " << diff <<"\n"; - - printTensor(x_prenorm); - printTensor(x_postresidual); - - printf("Done\n"); - return 0; -} diff --git a/datasets/download_datasets.py b/datasets/download_datasets.py index 872435cd..804ff7f6 100644 --- a/datasets/download_datasets.py +++ b/datasets/download_datasets.py @@ -11,7 +11,7 @@ def download_hf(filename, dataset, subset, split, key, div): hf_dataset = load_dataset(dataset, subset, split = split) data = div.join(hf_dataset[key]) - with open(filename, "w") as f: + with open(filename, "w", encoding="utf-8") as f: f.write(data) download_hf("wikitext2.txt", "wikitext", "wikitext-2-raw-v1", "test", "text", "\n\n") diff --git a/docker-compose.yml b/docker-compose.yml index 5e47c1c2..369b6864 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,15 +4,16 @@ services: web: build: context: . + args: + - RUN_UID=$RUN_UID + - APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH command: | --host 0.0.0.0:$PORT env_file: - .env - environment: - - CONTAINER_MODEL_PATH=$CONTAINER_MODEL_PATH volumes: - - $MODEL_PATH:$CONTAINER_MODEL_PATH - - $SESSIONS_PATH:/home/user/exllama_sessions + - $MODEL_PATH:$APPLICATION_STATE_PATH/model + - $SESSIONS_PATH:$APPLICATION_STATE_PATH/exllama_sessions ports: - "$PORT:$PORT" tmpfs: diff --git a/entrypoint.sh b/entrypoint.sh index 79864aa8..74d077f7 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,15 +1,19 @@ #!/usr/bin/env bash set -Eeuo pipefail -# Ensure that the model path is set -if [ -z $CONTAINER_MODEL_PATH ]; then - echo "Must specify model path" +# Ensure that the application state path is set +if [ -z $APPLICATION_STATE_PATH ]; then + echo "Must specify application state path" exit 1 fi -# Ensure that bind-mounted directories are owned by the user that runs the service -chown -R $RUN_UID:$RUN_UID $CONTAINER_MODEL_PATH -chown -R $RUN_UID:$RUN_UID /home/user/exllama_sessions +# Ensure that bind-mounted directories are owned by the user that runs the service if the user is not root +if [ $RUN_UID -ne 0 ]; then + chown -R $RUN_UID:$RUN_UID $APPLICATION_STATE_PATH +fi # Run service as specified (non-root) user -exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py -d $CONTAINER_MODEL_PATH $@ +exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py \ + -d $CONTAINER_MODEL_PATH \ + --sessions_dir $CONTAINER_SESSIONS_PATH \ + $@ diff --git a/example_basic.py b/example_basic.py deleted file mode 100644 index 8db3b424..00000000 --- a/example_basic.py +++ /dev/null @@ -1,45 +0,0 @@ -from model import ExLlama, ExLlamaCache, ExLlamaConfig -from tokenizer import ExLlamaTokenizer -from generator import ExLlamaGenerator -import os, glob - -# Directory containt model, tokenizer, generator - -model_directory = "/mnt/str/models/llama-13b-4bit-128g/" - -# Locate files we need within that directory - -tokenizer_path = os.path.join(model_directory, "tokenizer.model") -model_config_path = os.path.join(model_directory, "config.json") -st_pattern = os.path.join(model_directory, "*.safetensors") -model_path = glob.glob(st_pattern)[0] - -# Create config, model, tokenizer and generator - -config = ExLlamaConfig(model_config_path) # create config from config.json -config.model_path = model_path # supply path to model weights file - -model = ExLlama(config) # create ExLlama instance and load the weights -tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - -cache = ExLlamaCache(model) # create cache for inference 
-generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - -# Configure generator - -generator.disallow_tokens([tokenizer.eos_token_id]) - -generator.settings.token_repetition_penalty_max = 1.2 -generator.settings.temperature = 0.95 -generator.settings.top_p = 0.65 -generator.settings.top_k = 100 -generator.settings.typical = 0.5 - -# Produce a simple generation - -prompt = "Once upon a time," -print (prompt, end = "") - -output = generator.generate_simple(prompt, max_new_tokens = 200) - -print(output[len(prompt):]) diff --git a/example_chatbot.py b/example_chatbot.py deleted file mode 100644 index 6e450d22..00000000 --- a/example_chatbot.py +++ /dev/null @@ -1,209 +0,0 @@ -import argparse -import torch -import sys -import os -import glob -import model_init - -from .model import ExLlama, ExLlamaCache, ExLlamaConfig -from .tokenizer import ExLlamaTokenizer -from .generator import ExLlamaGenerator - -# Simple interactive chatbot script - -torch.set_grad_enabled(False) -torch.cuda._lazy_init() - -# Parse arguments - -parser = argparse.ArgumentParser(description = "Simple chatbot example for ExLlama") - -model_init.add_args(parser) - -parser.add_argument("-p", "--prompt", type = str, help = "Prompt file") -parser.add_argument("-un", "--username", type = str, help = "Display name of user", default = "User") -parser.add_argument("-bn", "--botname", type = str, help = "Display name of chatbot", default = "Chatbort") -parser.add_argument("-bf", "--botfirst", action = "store_true", help = "Start chat on bot's turn") - -parser.add_argument("-nnl", "--no_newline", action = "store_true", help = "Do not break bot's response on newline (allow multi-paragraph responses)") -parser.add_argument("-temp", "--temperature", type = float, help = "Temperature", default = 0.95) -parser.add_argument("-topk", "--top_k", type = int, help = "Top-K", default = 20) -parser.add_argument("-topp", "--top_p", type = float, help = "Top-P", default = 0.65) -parser.add_argument("-minp", "--min_p", type = float, help = "Min-P", default = 0.00) -parser.add_argument("-repp", "--repetition_penalty", type = float, help = "Repetition penalty", default = 1.15) -parser.add_argument("-repps", "--repetition_penalty_sustain", type = int, help = "Past length for repetition penalty", default = 256) -parser.add_argument("-beams", "--beams", type = int, help = "Number of beams for beam search", default = 1) -parser.add_argument("-beamlen", "--beam_length", type = int, help = "Number of future tokens to consider", default = 1) - -args = parser.parse_args() -model_init.post_parse(args) -model_init.get_model_files(args) - -# Some feedback - -print(f" -- Sequence length: {args.length}") -print(f" -- Temperature: {args.temperature:.2f}") -print(f" -- Top-K: {args.top_k}") -print(f" -- Top-P: {args.top_p:.2f}") -print(f" -- Min-P: {args.min_p:.2f}") -print(f" -- Repetition penalty: {args.repetition_penalty:.2f}") -print(f" -- Beams: {args.beams} x {args.beam_length}") - -print_opts = [] -if args.no_newline: print_opts.append("no_newline") -if args.botfirst: print_opts.append("botfirst") - -model_init.print_options(args, print_opts) - -username = args.username -bot_name = args.botname - -# Load prompt file - -if args.prompt is not None: - with open(args.prompt, "r") as f: - past = f.read() - past = past.replace("{username}", username) - past = past.replace("{bot_name}", bot_name) - past = past.strip() + "\n" -else: - past = f"{bot_name}: Hello, {username}\n" - -# past += "User: Hi. 
Please say \"Shhhhhh\"?\n" -# args.botfirst = True - -# Instantiate model and generator - -config = model_init.make_config(args) - -model = ExLlama(config) -cache = ExLlamaCache(model) -tokenizer = ExLlamaTokenizer(args.tokenizer) - -model_init.print_stats(model) - -generator = ExLlamaGenerator(model, tokenizer, cache) -generator.settings = ExLlamaGenerator.Settings() -generator.settings.temperature = args.temperature -generator.settings.top_k = args.top_k -generator.settings.top_p = args.top_p -generator.settings.min_p = args.min_p -generator.settings.token_repetition_penalty_max = args.repetition_penalty -generator.settings.token_repetition_penalty_sustain = args.repetition_penalty_sustain -generator.settings.token_repetition_penalty_decay = generator.settings.token_repetition_penalty_sustain // 2 -generator.settings.beams = args.beams -generator.settings.beam_length = args.beam_length - -break_on_newline = not args.no_newline - -# Be nice to Chatbort - -min_response_tokens = 4 -max_response_tokens = 256 -extra_prune = 256 - -print(past, end = "") -ids = tokenizer.encode(past) -generator.gen_begin(ids) - -next_userprompt = username + ": " - -first_round = True - -while True: - - res_line = bot_name + ":" - res_tokens = tokenizer.encode(res_line) - num_res_tokens = res_tokens.shape[-1] # Decode from here - - if first_round and args.botfirst: in_tokens = res_tokens - - else: - - # Read and format input - - in_line = input(next_userprompt) - in_line = username + ": " + in_line.strip() + "\n" - - next_userprompt = username + ": " - - # No need for this, really, unless we were logging the chat. The actual history we work on is kept in the - # tokenized sequence in the generator and the state in the cache. - - past += in_line - - # SentencePiece doesn't tokenize spaces separately so we can't know from individual tokens if they start a new word - # or not. Instead, repeatedly decode the generated response as it's being built, starting from the last newline, - # and print out the differences between consecutive decodings to stream out the response. - - in_tokens = tokenizer.encode(in_line) - in_tokens = torch.cat((in_tokens, res_tokens), dim = 1) - - # If we're approaching the context limit, prune some whole lines from the start of the context. Also prune a - # little extra so we don't end up rebuilding the cache on every line when up against the limit. - - expect_tokens = in_tokens.shape[-1] + max_response_tokens - max_tokens = config.max_seq_len - expect_tokens - if generator.gen_num_tokens() >= max_tokens: - generator.gen_prune_to(config.max_seq_len - expect_tokens - extra_prune, tokenizer.newline_token_id) - - # Feed in the user input and "{bot_name}:", tokenized - - generator.gen_feed_tokens(in_tokens) - - # Generate with streaming - - print(res_line, end = "") - sys.stdout.flush() - - generator.begin_beam_search() - - for i in range(max_response_tokens): - - # Disallowing the end condition tokens seems like a clean way to force longer replies. 
- - if i < min_response_tokens: - generator.disallow_tokens([tokenizer.newline_token_id, tokenizer.eos_token_id]) - else: - generator.disallow_tokens(None) - - # Get a token - - gen_token = generator.beam_search() - - # If token is EOS, replace it with newline before continuing - - if gen_token.item() == tokenizer.eos_token_id: - generator.replace_last_token(tokenizer.newline_token_id) - - # Decode the current line and print any characters added - - num_res_tokens += 1 - text = tokenizer.decode(generator.sequence_actual[:, -num_res_tokens:][0]) - new_text = text[len(res_line):] - - skip_space = res_line.endswith("\n") and new_text.startswith(" ") # Bit prettier console output - res_line += new_text - if skip_space: new_text = new_text[1:] - - print(new_text, end="") # (character streaming output is here) - sys.stdout.flush() - - # End conditions - - if break_on_newline and gen_token.item() == tokenizer.newline_token_id: break - if gen_token.item() == tokenizer.eos_token_id: break - - # Some models will not (or will inconsistently) emit EOS tokens but in a chat sequence will often begin - # generating for the user instead. Try to catch this and roll back a few tokens to begin the user round. - - if res_line.endswith(f"{username}:"): - plen = tokenizer.encode(f"{username}:").shape[-1] - generator.gen_rewind(plen) - next_userprompt = " " - break - - generator.end_beam_search() - - past += res_line - first_round = False diff --git a/example_flask.py b/example_flask.py deleted file mode 100644 index 58aca5fe..00000000 --- a/example_flask.py +++ /dev/null @@ -1,92 +0,0 @@ -from flask import Flask, request -from model import ExLlama, ExLlamaCache, ExLlamaConfig -from tokenizer import ExLlamaTokenizer -from generator import ExLlamaGenerator -import os, glob - -# Directory containing config.json, tokenizer.model and safetensors file for the model -model_directory = "/mnt/str/models/llama-7b-4bit/" - -tokenizer_path = os.path.join(model_directory, "tokenizer.model") -model_config_path = os.path.join(model_directory, "config.json") -st_pattern = os.path.join(model_directory, "*.safetensors") -model_path = glob.glob(st_pattern)[0] - -config = ExLlamaConfig(model_config_path) # create config from config.json -config.model_path = model_path # supply path to model weights file - -model = ExLlama(config) # create ExLlama instance and load the weights -print(f"Model loaded: {model_path}") - -tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file -cache = ExLlamaCache(model) # create cache for inference -generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - -# Flask app - -app = Flask(__name__) - - -# Inference with settings equivalent to the "precise" preset from the /r/LocalLLaMA wiki - -@app.route('/infer_precise', methods=['POST']) -def inferContextP(): - print(request.form) - prompt = request.form.get('prompt') - - generator.settings.token_repetition_penalty_max = 1.176 - generator.settings.token_repetition_penalty_sustain = config.max_seq_len - generator.settings.temperature = 0.7 - generator.settings.top_p = 0.1 - generator.settings.top_k = 40 - generator.settings.typical = 0.0 # Disabled - - outputs = generator.generate_simple(prompt, max_new_tokens = 200) - return outputs - - -# Inference with settings equivalent to the "creative" preset from the /r/LocalLLaMA wiki - -@app.route('/infer_creative', methods=['POST']) -def inferContextC(): - print(request.form) - prompt = request.form.get('prompt') - - 
generator.settings.token_repetition_penalty_max = 1.1 - generator.settings.token_repetition_penalty_sustain = config.max_seq_len - generator.settings.temperature = 0.72 - generator.settings.top_p = 0.73 - generator.settings.top_k = 0 # Disabled - generator.settings.typical = 0.0 # Disabled - - outputs = generator.generate_simple(prompt, max_new_tokens = 200) - return outputs - - -# Inference with settings equivalent to the "sphinx" preset from the /r/LocalLLaMA wiki - -@app.route('/infer_sphinx', methods=['POST']) -def inferContextS(): - print(request.form) - prompt = request.form.get('prompt') - - generator.settings.token_repetition_penalty_max = 1.15 - generator.settings.token_repetition_penalty_sustain = config.max_seq_len - generator.settings.temperature = 1.99 - generator.settings.top_p = 0.18 - generator.settings.top_k = 30 - generator.settings.typical = 0.0 # Disabled - - outputs = generator.generate_simple(prompt, max_new_tokens = 200) - return outputs - - -# Start Flask app - -host = "0.0.0.0" -port = 8004 -print(f"Starting server on address {host}:{port}") - -if __name__ == '__main__': - from waitress import serve - serve(app, host = host, port = port) diff --git a/example_lora.py b/example_lora.py deleted file mode 100644 index e47c34f5..00000000 --- a/example_lora.py +++ /dev/null @@ -1,79 +0,0 @@ -from model import ExLlama, ExLlamaCache, ExLlamaConfig -from tokenizer import ExLlamaTokenizer -from generator import ExLlamaGenerator -from lora import ExLlamaLora -import os, glob -import torch - -# Directory containt model, tokenizer, generator - -model_directory = "/mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-7B-4bit-128g/" - -# Directory containing LoRA config and weights - -lora_directory = "/mnt/str/models/_test_loras/tloen_alpaca-lora-7b/" - -# Locate files we need within those directories - -tokenizer_path = os.path.join(model_directory, "tokenizer.model") -model_config_path = os.path.join(model_directory, "config.json") -st_pattern = os.path.join(model_directory, "*.safetensors") -model_path = glob.glob(st_pattern)[0] - -lora_config_path = os.path.join(lora_directory, "adapter_config.json") -lora_path = os.path.join(lora_directory, "adapter_model.bin") - -# Create config, model, tokenizer and generator - -config = ExLlamaConfig(model_config_path) # create config from config.json -config.model_path = model_path # supply path to model weights file - -model = ExLlama(config) # create ExLlama instance and load the weights -tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file - -cache = ExLlamaCache(model) # create cache for inference -generator = ExLlamaGenerator(model, tokenizer, cache) # create generator - -# Load LoRA - -lora = ExLlamaLora(model, lora_config_path, lora_path) - -# Configure generator - -generator.settings.token_repetition_penalty_max = 1.2 -generator.settings.temperature = 0.65 -generator.settings.top_p = 0.4 -generator.settings.top_k = 0 -generator.settings.typical = 0.0 - -# Alpaca prompt - -prompt = \ - "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n" \ - "\n" \ - "### Instruction:\n" \ - "List five colors in alphabetical order.\n" \ - "\n" \ - "### Response:" - -# Generate with LoRA - -print(" --- LoRA ----------------- ") -print("") - -generator.lora = lora -torch.manual_seed(1337) -output = generator.generate_simple(prompt, max_new_tokens = 200) -print(output) - -# Generate without LoRA - -print("") -print(" --- No LoRA -------------- ") -print("") - -generator.lora = None -torch.manual_seed(1337) -output = generator.generate_simple(prompt, max_new_tokens = 200) -print(output) - diff --git a/exllama/generator.py b/exllama/generator.py index 2d02a6da..4ba2f0a7 100644 --- a/exllama/generator.py +++ b/exllama/generator.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F +DEFAULT_MAX_CHUNK = 2048 class ExLlamaGenerator: @@ -62,11 +63,27 @@ def make_rep_mask(self, penalty_max, sustain, decay): return cuda_ext.ext_rep_penalty_mask_cpu(self.model.config.vocab_size, self.sequence, penalty_max, sustain, decay) + def batched_sample(self, logits, temperature, top_k, top_p, min_p, typical, num = 1): + + if logits.shape[0] == 1: return self.sample(logits, temperature, top_k, top_p, min_p, typical, num) + + samples = [] + scores = [] + for i in range(logits.shape[0]): + t, s = self.sample(logits[i, :, :], temperature, top_k, top_p, min_p, typical) + samples.append(t) + scores.append(s) + + return torch.cat(samples, dim = 0), torch.cat(scores, dim = 0) + + def sample(self, logits, temperature, top_k, top_p, min_p, typical, num = 1): # torch.manual_seed(42) - logits = logits[0, -1, :] + if logits.dim() == 3: logits = logits[0, -1, :] + elif logits.dim() == 2: logits = logits[-1, :] + else: raise ValueError("Bad logits dimension") # Disallow tokens @@ -147,7 +164,7 @@ def disallow_tokens(self, tokens): self.disallowed_tokens = tokens - def gen_begin(self, in_tokens): + def gen_begin(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): self.end_beam_search() @@ -156,7 +173,11 @@ def gen_begin(self, in_tokens): self.cache.current_seq_len = 0 if in_tokens.shape[-1] > 1: - self.model.forward(self.sequence[:, :-1], self.cache, preprocess_only = True, lora = self.lora) + a = 0 + while a < self.sequence.shape[-1] - 1: + b = min(a + max_chunk, self.sequence.shape[-1] - 1) + self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora) + a = b def gen_begin_empty(self): @@ -167,11 +188,11 @@ def gen_begin_empty(self): self.cache.current_seq_len = 0 - def gen_begin_reuse(self, in_tokens): + def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): self.end_beam_search() if self.sequence is None or self.cache.current_seq_len == 0: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return 0 # if in_tokens.shape[-1] < self.sequence.shape[-1]: @@ -182,7 +203,7 @@ def gen_begin_reuse(self, in_tokens): reuse += 1 if reuse < 2: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return 0 # print (f"Reusing cache: {reuse} tokens") @@ -195,10 +216,10 @@ def gen_begin_reuse(self, in_tokens): return reuse - def gen_feed_tokens(self, in_tokens): + def gen_feed_tokens(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK): if self.sequence is None: - self.gen_begin(in_tokens) + self.gen_begin(in_tokens, max_chunk) return self.end_beam_search() @@ -209,7 +230,12 @@ def gen_feed_tokens(self, in_tokens): self.sequence = in_tokens.clone() else: self.sequence = torch.cat((self.sequence, in_tokens), dim = 1) - 
self.model.forward(self.sequence[:, start:-1], self.cache, preprocess_only = True, lora = self.lora) + + a = start + while a < self.sequence.shape[-1] - 1: + b = min(a + max_chunk, self.sequence.shape[-1] - 1) + self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora) + a = b self.sequence_actual = self.sequence @@ -277,7 +303,7 @@ def gen_num_tokens(self): return self.sequence_actual.shape[-1] - # Generate some number of tokens and append to + # Simple generator function def generate_simple(self, prompt, max_new_tokens = 128): @@ -286,17 +312,22 @@ def generate_simple(self, prompt, max_new_tokens = 128): ids = self.tokenizer.encode(prompt) self.gen_begin(ids) + max_new_tokens = min(max_new_tokens, self.model.config.max_seq_len - ids.shape[1]) + + eos = torch.zeros((ids.shape[0],), dtype = torch.bool) for i in range(max_new_tokens): token = self.gen_single_token() - if token.item() == self.tokenizer.eos_token_id: break + for j in range(token.shape[0]): + if token[j, 0].item() == self.tokenizer.eos_token_id: eos[j] = True + if eos.all(): break - text = self.tokenizer.decode(self.sequence[0]) + text = self.tokenizer.decode(self.sequence[0] if self.sequence.shape[0] == 1 else self.sequence) return text # Generate a single token with the current settings, append to sequence - def gen_single_token(self, constraints = None, lora = None): + def gen_single_token(self, constraints = None): self.end_beam_search() @@ -304,12 +335,14 @@ def gen_single_token(self, constraints = None, lora = None): if self.sequence is not None: - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) + logits[:, :, self.tokenizer.bos_token_id] = -10000.0 if constraints is not None: @@ -317,12 +350,12 @@ def gen_single_token(self, constraints = None, lora = None): for c in constraints: logits[:, :, c] += 10000.0 logits[:, :, :] -= 10000.0 - token, _ = self.sample(logits, - self.settings.temperature, - self.settings.top_k, - self.settings.top_p, - self.settings.min_p + 0.01 if constraints is not None else 0.0, - self.settings.typical) + token, _ = self.batched_sample(logits, + self.settings.temperature, + self.settings.top_k, + self.settings.top_p, + self.settings.min_p + 0.01 if constraints is not None else 0.0, + self.settings.typical) else: @@ -481,13 +514,14 @@ def beam_search(self): # Initial tokens for initial beams - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - # self.cache.debug() logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) tokens, probs = self.sample(logits, self.settings.temperature, @@ -515,13 +549,14 @@ def beam_search(self): beam.to_sequence() - rep_mask = self.make_rep_mask(self.settings.token_repetition_penalty_max, - 
self.settings.token_repetition_penalty_sustain, - self.settings.token_repetition_penalty_decay) - # self.cache.debug() logits = self.model.forward(self.sequence[:, -1:], self.cache, lora = self.lora) - logits /= rep_mask + + cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence, + self.settings.token_repetition_penalty_max, + self.settings.token_repetition_penalty_sustain, + self.settings.token_repetition_penalty_decay, + logits) tokens, probs = self.sample(logits, self.settings.temperature, diff --git a/exllama/lora.py b/exllama/lora.py index f279af3b..9dd1dd34 100644 --- a/exllama/lora.py +++ b/exllama/lora.py @@ -15,6 +15,7 @@ class ExLlamaLora: lora_scaling: float config: ExLlamaConfig tensors: dict[torch.tensor] + bias_ignored: bool def __init__(self, model, lora_config_path, lora_path): @@ -23,6 +24,7 @@ def __init__(self, model, lora_config_path, lora_path): self.model = model self.config = model.config self.tensors = {} + self.bias_ignored = False # Grab relevant items from LoRA config @@ -58,6 +60,13 @@ def __init__(self, model, lora_config_path, lora_path): decoder_layer = ks[4] lora_half = ks[5] + if lora_half == "bias": + epsilon = 1e-6 + if torch.max(tensor) > epsilon or torch.max(tensor) < -epsilon: + raise ValueError(f" ## Error: unsupported bias target {self.lora_path}: {key}") + self.bias_ignored = True + continue + target_module = self.model.layers[decoder_idx] if decoder_part == "self_attn": target_module = target_module.self_attn elif decoder_part == "mlp": target_module = target_module.mlp diff --git a/exllama/model.py b/exllama/model.py index efac3f8f..03a177bf 100644 --- a/exllama/model.py +++ b/exllama/model.py @@ -1,3 +1,10 @@ +import sys +min_version = (3, 9) +if sys.version_info < min_version: + print("") + print(f" ## Warning: this project requires Python {min_version[0]}.{min_version[1]} or higher.") + print("") + import torch from torch import nn import torch.nn.functional as F @@ -64,6 +71,7 @@ def __init__(self, model_config_path): # Optional settings self.max_seq_len = 2048 # Reduce to save memory. Can also be increased, but the pretrained models produce degenerate output after 2048 tokens in any case. Should be possible to finetune for longer sequence lengths. + self.compress_pos_emb = 1.0 # Increase to compress positional embeddings applied to sequence self.gpu_peer_fix = False # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. 
Enable this to move tensors via system RAM instead, where needed self.auto_map = None # List of floats with memory allocation in GB, per CUDA device, overrides device_map @@ -304,9 +312,9 @@ def fused(self, hidden_states, cache, buffer, input_layernorm, lora): # Project q, k, v, apply position embeddings to k and v, update cache - query_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - key_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) - value_states = torch.empty((q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + query_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + key_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) + value_states = torch.empty((bsz, q_len, self.config.hidden_size), dtype = torch.float16, device = hidden_states.device) cuda_ext.exllama_ext.q4_attn(hidden_states, input_layernorm.weight, @@ -398,8 +406,7 @@ def forward(self, hidden_states, cache, buffer, lora): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) attn_weights /= math.sqrt(self.config.head_dim) - if buffer.attn_mask is not None and buffer.attn_mask.shape[2] > 1: attn_weights = attn_weights + buffer.attn_mask - # attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + if buffer.attn_mask is not None: attn_weights = attn_weights + buffer.attn_mask attn_weights = nn.functional.softmax(attn_weights, dim = -1, dtype = torch.float16).to(query_states.dtype) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2) @@ -412,7 +419,7 @@ def forward(self, hidden_states, cache, buffer, lora): # it can only apply a square attention mask. It saves quite a bit of VRAM but in practice Torch seems to use # the same amount of memory at peak anyway. 
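+        # Torch's scaled_dot_product_attention cannot combine is_causal with an explicit attn_mask, so the
+        # masked path below is used whenever there is cached past context (past_len > 0) or a batched input
+        # that carries an attention mask (bsz > 1 and attn_mask is not None); a fresh single-sequence pass
+        # relies on the built-in causal masking instead.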
- if past_len > 0: + if past_len > 0 or (bsz > 1 and buffer.attn_mask is not None): attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = buffer.attn_mask, is_causal = False) else: attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = None, is_causal = True) @@ -616,7 +623,7 @@ def _move_tensor(tensor, new_device, name, config): device = str(tensor.device) if device == new_device: return tensor if config.gpu_peer_fix: - if device.startswith("cuda:") and new_device.startswith("cuda:"): + if str(device).startswith("cuda:") and str(new_device).startswith("cuda:"): tensor = tensor.to("cpu") return tensor.to(new_device) @@ -740,6 +747,8 @@ def __init__(self, config): inv_freq = 1.0 / (self.config.rotary_embedding_base ** (torch.arange(0, self.config.head_dim, 2, device = device).float() / self.config.head_dim)) t = torch.arange(self.config.max_seq_len, device = device, dtype = torch.float32) + if self.config.compress_pos_emb != 1.0: t /= self.config.compress_pos_emb + freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim = -1) @@ -793,7 +802,9 @@ def __init__(self, config): torch.cuda.empty_cache() - def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None): + def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None, output_device = None, input_mask = None): + + assert input_mask is None or input_mask.shape == input_ids.shape # if torch.is_grad_enabled(): # raise ValueError("Forward pass called with gradients enabled. Back propagation is not supported yet.") @@ -801,6 +812,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False batch_size, seq_len = input_ids.shape past_len = cache.current_seq_len + if output_device is None: output_device = input_ids.device buffer = ExLlamaBuffer(self.config) @@ -811,9 +823,16 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False if seq_len > 1: attn_mask = torch.zeros(batch_size, 1, seq_len, past_len + seq_len, dtype = torch.float16, device = devs[0]) - attn_mask_triu = torch.triu(torch.full((seq_len - 1, seq_len - 1), torch.finfo(torch.float16).min)) + attn_mask_triu = torch.triu(torch.full((seq_len - 1, seq_len - 1), -65504.)) attn_mask[:, :, : seq_len - 1, past_len + 1: past_len + seq_len] = attn_mask_triu + if input_mask is not None: + + input_mask = _move_tensor(input_mask, devs[0], "input_mask", self.config) + input_mask = torch.where(input_mask, 0, -65504.).half() + input_mask = input_mask.unsqueeze(1).unsqueeze(2) + attn_mask = torch.minimum(attn_mask, input_mask) + else: attn_mask = None @@ -824,7 +843,7 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False # Embeddings # TODO: Allow passing input embeddings instead of IDs - input_ids = _move_tensor(input_ids, "cpu", "input_ids", self.config) + input_ids = _move_tensor(input_ids, self.config.device_map.embed_tokens, "input_ids", self.config) hidden_states = self.embed_tokens(input_ids) # Split buffers to devices @@ -863,5 +882,15 @@ def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False # logits = cuda_ext.matmul_half(hidden_states, self.lm_head_data, cublas = False) logits = logits.float() - logits = _move_tensor(logits, self.config.device_map.embed_tokens, "logits", self.config) + logits = _move_tensor(logits, output_device, "logits", self.config) return logits + + + # Free unmanaged resources 
allocated by the C++ extension. Call this before dereferencing the ExLlama object, + # e.g. if you intend to create a new instance to load another model, but don't call it in a destructor that wraps + # the object, since it relies on CUDA function calls and the CUDA context is one of the first things to go when + # a PyTorch application terminates, before other managed objects are destroyed. + + def free_unmanaged(self): + + cuda_ext.exllama_ext.cleanup() diff --git a/exllama/tokenizer.py b/exllama/tokenizer.py index b17e9f2c..c6964a3e 100644 --- a/exllama/tokenizer.py +++ b/exllama/tokenizer.py @@ -10,19 +10,52 @@ def __init__(self, tokenizer_model_path): self.tokenizer = SentencePieceProcessor(model_file = self.path) self.eos_token_id = self.tokenizer.eos_id() self.bos_token_id = self.tokenizer.bos_id() + self.pad_token_id = 0 self.newline_token_id = 13 + # Encode string + def encode(self, text): - ids = self.tokenizer.Encode(text) - return torch.tensor(ids).unsqueeze(0) + if isinstance(text, list): + + # text is a list of strings + + list_ids = self.tokenizer.Encode(text) + max_length = max([len(ids) for ids in list_ids]) + + padded_ids = [] + for ids in list_ids: + padding = torch.full((max_length - len(ids),), self.pad_token_id) + sequence = torch.tensor(ids) + padded_ids.append(torch.cat((padding, sequence), dim = 0)) + + return torch.stack(padded_ids, dim = 0) + + else: + + # text is a single string + + ids = self.tokenizer.Encode(text) + return torch.tensor(ids).unsqueeze(0) def decode(self, ids): - if not isinstance(ids, list): + if ids.dim() > 1: + + texts = [] + for i in range(ids.shape[0]): + seq = ids[i].tolist() + seq = [t for t in seq if t != self.pad_token_id] + if self.eos_token_id in seq: seq = seq[:seq.index(self.eos_token_id)] + texts.append(self.tokenizer.Decode(seq)) + return texts + + else: + ids = ids.tolist() - text = self.tokenizer.Decode(ids) - return text + text = self.tokenizer.Decode(ids) + return text def num_tokens(self, text): diff --git a/exllama_ext/cpu_func/rep_penalty.cpp b/exllama_ext/cpu_func/rep_penalty.cpp index c9ffe0b7..26c9dc16 100644 --- a/exllama_ext/cpu_func/rep_penalty.cpp +++ b/exllama_ext/cpu_func/rep_penalty.cpp @@ -1,4 +1,6 @@ #include "rep_penalty.h" +#include +#include void rep_penalty_cpu ( @@ -26,4 +28,47 @@ void rep_penalty_cpu if (v > rep_mask[t]) rep_mask[t] = v; if (--s < 0) v += dv; } +} + +bool* g_rep_mask = NULL; +int g_vocab_size = 0; + +void apply_rep_penalty_cpu +( + const int vocab_size, + const uint64_t* sequence, + const float penalty_max, + const int sustain, + const int decay, + const int seq_len, + float* logits +) +{ + if (vocab_size != g_vocab_size) + { + if (g_rep_mask) free(g_rep_mask); + g_vocab_size = vocab_size; + g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool)); + } + + memset(g_rep_mask, 0, g_vocab_size * sizeof(bool)); + + float v = penalty_max; + float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; + + int s = sustain == -1 ? 
seq_len : sustain; + int beg = seq_len - sustain - decay; + if (beg < 0) beg = 0; + + for (int i = seq_len; i > beg;) + { + uint64_t t = sequence[--i]; + if (!g_rep_mask[t]) + { + if (logits[t] > 0.0) logits[t] /= v; + else logits[t] *= v; + g_rep_mask[t] = true; + } + if (--s < 0) v += dv; + } } \ No newline at end of file diff --git a/exllama_ext/cpu_func/rep_penalty.h b/exllama_ext/cpu_func/rep_penalty.h index 895ddd93..4f63b484 100644 --- a/exllama_ext/cpu_func/rep_penalty.h +++ b/exllama_ext/cpu_func/rep_penalty.h @@ -15,4 +15,16 @@ void rep_penalty_cpu const int seq_len ); +void apply_rep_penalty_cpu +( + const int vocab_size, + const uint64_t* sequence, + const float penalty_max, + const int sustain, + const int decay, + const int seq_len, + float* logits +); + + #endif diff --git a/exllama_ext/cuda_buffers.cu b/exllama_ext/cuda_buffers.cu index 7e7bd75d..73146958 100644 --- a/exllama_ext/cuda_buffers.cu +++ b/exllama_ext/cuda_buffers.cu @@ -35,6 +35,12 @@ CudaBuffers::CudaBuffers CudaBuffers::~CudaBuffers() { + cudaStreamDestroy(alt_stream_1); + cudaStreamDestroy(alt_stream_2); + cudaStreamDestroy(alt_stream_3); + cudaEventDestroy(alt_stream_1_done); + cudaEventDestroy(alt_stream_2_done); + cudaEventDestroy(alt_stream_3_done); } float* CudaBuffers::get_zeros_float(const int num_zeros) @@ -76,23 +82,14 @@ void prepare_buffers_cuda ); g_buffers[_device] = buffers; +} -// if (!q4_table_init) -// { -// for (uint v_zero = 0; v_zero < 16; v_zero++) -// { -// for (uint v_read = 0; v_read < 256; v_read++) -// { -// half v_0 = __float2half((float)((int)((v_read ) & 0x0f) - v_zero - 1)); -// half v_1 = __float2half((float)((int)((v_read >> 4) & 0x0f) - v_zero - 1)); -// half2 v_01 = {v_0, v_1}; -// q4_table_host[v_zero][v_read] = v_01; -// } -// } -// q4_table_init = true; -// } -// -// cudaSetDevice(_device); -// cudaMemcpyToSymbol(q4_table, q4_table_host, 16 * 256 * sizeof(half2)); -// cudaDeviceSynchronize(); +void cleanup_buffers_cuda() +{ + for (int i = 0; i < CUDA_MAX_DEVICES; i++) + { + if (!g_buffers[i]) continue; + delete g_buffers[i]; + g_buffers[i] = NULL; + } } diff --git a/exllama_ext/cuda_buffers.cuh b/exllama_ext/cuda_buffers.cuh index 86e8af82..8b3d451d 100644 --- a/exllama_ext/cuda_buffers.cuh +++ b/exllama_ext/cuda_buffers.cuh @@ -58,4 +58,6 @@ void prepare_buffers_cuda int _max_zeros_float ); +void cleanup_buffers_cuda(); + #endif \ No newline at end of file diff --git a/exllama_ext/cuda_func/half_matmul.cu b/exllama_ext/cuda_func/half_matmul.cu index 6f62d9a4..76ee1e43 100644 --- a/exllama_ext/cuda_func/half_matmul.cu +++ b/exllama_ext/cuda_func/half_matmul.cu @@ -130,7 +130,7 @@ const int S_THREADS_X = 8; // width const int S_THREADS_Z = 1; // height const int S_BLOCKSIZE = MAX_DIM_SMALL / 1024 * S_THREADS_X; // dim -template +template __global__ void half_matmul_small_kernel ( const half* __restrict__ x, @@ -163,7 +163,7 @@ __global__ void half_matmul_small_kernel const half* w_ptr = w_.item_ptr(k, column); half* out_ptr = out_.item_ptr(row, column); - if constexpr (use_half2) + if constexpr (use_half2 && !odd_rank) { half2* x_ptr2 = (half2*) x_ptr; half2* x_ptr2_end = (half2*) x_ptr_end; @@ -204,13 +204,22 @@ __global__ void half_matmul_small_kernel while(x_ptr < x_ptr_end) { - #pragma unroll - for (int i = 0; i < 4; ++i) + if constexpr (odd_rank) { half x_item = *x_ptr++; half w_item = *w_ptr; w_ptr += width; r = __hfma(x_item, w_item, r); } + else + { + #pragma unroll + for (int i = 0; i < 4; ++i) + { + half x_item = *x_ptr++; + half w_item = *w_ptr; w_ptr += width; 
+ r = __hfma(x_item, w_item, r); + } + } } __shared__ half accum[MAX_DIM_SMALL / S_BLOCKSIZE][S_THREADS_X]; @@ -242,7 +251,7 @@ void half_matmul_small_cuda { bool use_half2 = !tuningParams->matmul_no_half2; - // printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); + //printf("kernel: (%i, %i) @ (%i, %i) -> (%i, %i)\n", height, dim, dim, width, height, width); dim3 threads ( @@ -258,11 +267,18 @@ void half_matmul_small_cuda height ); - // printf("t... %i %i %i\n", threads.x, threads.y, threads.z); - // printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); + //printf("t... %i %i %i\n", threads.x, threads.y, threads.z); + //printf("b... %i %i %i\n", blocks.x, blocks.y, blocks.z); //if (!no_zero) cudaMemsetAsync(out, 0, height * width * sizeof(half)); - if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); - else half_matmul_small_kernel<<>>(x, w, out, height, dim, width, no_zero); + if (dim & 0x03) + { + half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + } + else + { + if (use_half2) half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + else half_matmul_small_kernel <<>>(x, w, out, height, dim, width, no_zero); + } } diff --git a/exllama_ext/cuda_func/q4_attn.cu b/exllama_ext/cuda_func/q4_attn.cu index 54336962..5c6e1be4 100644 --- a/exllama_ext/cuda_func/q4_attn.cu +++ b/exllama_ext/cuda_func/q4_attn.cu @@ -87,6 +87,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, @@ -119,7 +120,7 @@ void q4_attn_cuda ((num_heads + THREADS_Z - 1) / THREADS_Z + BLOCKSIZE_Z - 1) / BLOCKSIZE_Z ); - int _rows = q_len * num_heads; + int _rows_per_batch = q_len * num_heads; CudaBuffers* buffers = get_buffers(device_index); @@ -156,8 +157,8 @@ void q4_attn_cuda // Positional embeddings q, k - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len); // Update cache tensors with projected k, v @@ -177,13 +178,13 @@ void q4_attn_cuda // str_1: project q, positions q, sync q4_matmul_cuda(tuningParams, temp_x, q_len, q_proj, query_states, q_a ? true : false, str_1); - rope_cuda(tuningParams, query_states, sin, cos, _rows, head_dim, num_heads, past_len, str_1); + rope_cuda(tuningParams, query_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_1); cudaEventRecord(sync_1, str_1); // str_2: project k, positions k, sync q4_matmul_cuda(tuningParams, temp_x, q_len, k_proj, key_states, k_a ? 
true : false, str_2); - rope_cuda(tuningParams, key_states, sin, cos, _rows, head_dim, num_heads, past_len, str_2); + rope_cuda(tuningParams, key_states, sin, cos, bsz, _rows_per_batch, head_dim, num_heads, past_len, str_2); cudaEventRecord(sync_2, str_2); // str_3: project v, wait for str_2, copy (k,v) to cache, sync diff --git a/exllama_ext/cuda_func/q4_attn.cuh b/exllama_ext/cuda_func/q4_attn.cuh index 7a6ae9ae..ce2e29fa 100644 --- a/exllama_ext/cuda_func/q4_attn.cuh +++ b/exllama_ext/cuda_func/q4_attn.cuh @@ -24,6 +24,7 @@ void q4_attn_cuda Q4Matrix* v_proj, half* sin, half* cos, + const int bsz, const int q_len, const int dim, const int head_dim, diff --git a/exllama_ext/cuda_func/q4_matmul.cu b/exllama_ext/cuda_func/q4_matmul.cu index 04047617..fe2343f7 100644 --- a/exllama_ext/cuda_func/q4_matmul.cu +++ b/exllama_ext/cuda_func/q4_matmul.cu @@ -245,6 +245,10 @@ void q4_matmul_recons_cuda const half alpha = __float2half(1.0f); const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f); - cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width); + +// const float alpha = 1.0f; +// const float beta = no_zero ? 1.0f : 0.0f; +// cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width, +// x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width); } diff --git a/exllama_ext/cuda_func/q4_matrix.cu b/exllama_ext/cuda_func/q4_matrix.cu index e856a978..d02fa0ec 100644 --- a/exllama_ext/cuda_func/q4_matrix.cu +++ b/exllama_ext/cuda_func/q4_matrix.cu @@ -20,6 +20,7 @@ void g_q4_keep_matrix(Q4Matrix* m) void g_q4_free_matrices() { for (const auto& m : g_q4_matrices) delete m; + g_q4_matrices.clear(); } Q4Matrix::Q4Matrix diff --git a/exllama_ext/cuda_func/rope.cu b/exllama_ext/cuda_func/rope.cu index 5179553c..ef64e9da 100644 --- a/exllama_ext/cuda_func/rope.cu +++ b/exllama_ext/cuda_func/rope.cu @@ -23,13 +23,14 @@ __global__ void rope_cuda_kernel half* __restrict__ x, const half* __restrict__ sin, const half* __restrict__ cos, - int rows, + int rows_per_batch, int head_dim, int num_heads, int past_len ) { - MatrixView_half_rw x_(x, rows, head_dim); + // These heights aren't used so it's okay if they're wrong. 
+ MatrixView_half_rw x_(x, rows_per_batch, head_dim); MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim); MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim); @@ -37,7 +38,9 @@ __global__ void rope_cuda_kernel int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2; int row = blockIdx.y * THREADS_Y + threadIdx.y; - if (row >= rows) return; + if (row >= rows_per_batch) return; + int batch_offset = blockIdx.z * rows_per_batch; + int row_offset = batch_offset + row; // Get sin and cos @@ -54,14 +57,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half2 item2_l = x_.item_half2(row, column); - half2 item2_r = x_.item_half2(row, column + half_dim); + half2 item2_l = x_.item_half2(row_offset, column); + half2 item2_r = x_.item_half2(row_offset, column + half_dim); half2 item2_ls = __hmul2(item2_r, sin2_l); half2 item2_rs = __hmul2(item2_l, sin2_r); item2_l = __hfma2(item2_l, cos2_l, item2_ls); item2_r = __hfma2(item2_r, cos2_r, item2_rs); - x_.set_half2(row, column, item2_l); - x_.set_half2(row, column + half_dim, item2_r); + x_.set_half2(row_offset, column, item2_l); + x_.set_half2(row_offset, column + half_dim, item2_r); } else { @@ -73,14 +76,14 @@ __global__ void rope_cuda_kernel // Apply embedding to row - half item_l = x_.item(row, column); - half item_r = x_.item(row, column + half_dim); + half item_l = x_.item(row_offset, column); + half item_r = x_.item(row_offset, column + half_dim); half item_ls = __hmul(item_r, sin_l); half item_rs = __hmul(item_l, sin_r); item_l = __hfma(item_l, cos_l, item_ls); item_r = __hfma(item_r, cos_r, item_rs); - x_.set(row, column, item_l); - x_.set(row, column + half_dim, item_r); + x_.set(row_offset, column, item_l); + x_.set(row_offset, column + half_dim, item_r); } } @@ -100,7 +103,8 @@ void rope_cuda half* x, const half* sin, const half* cos, - const int rows, + const int bsz, + const int rows_per_batch, const int head_dim, const int num_heads, const int past_len, @@ -112,10 +116,10 @@ void rope_cuda dim3 blocks ( (head_dim + THREADS_X - 1) / THREADS_X / 2 / (tuningParams->rope_no_half2 ? 
1 : 2), - (rows + THREADS_Y - 1) / THREADS_Y, - 1 + (rows_per_batch + THREADS_Y - 1) / THREADS_Y, + int(bsz) ); fp_rope_cuda_kernel kernel = rope_cuda_kernel_pick(tuningParams); - kernel<<>>(x, sin, cos, rows, head_dim, num_heads, past_len); + kernel<<>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len); } diff --git a/exllama_ext/cuda_func/rope.cuh b/exllama_ext/cuda_func/rope.cuh index 64d5b88b..a0ffd33f 100644 --- a/exllama_ext/cuda_func/rope.cuh +++ b/exllama_ext/cuda_func/rope.cuh @@ -13,6 +13,7 @@ void rope_cuda half* x, const half* sin, const half* cos, + const int bsz, const int rows, const int head_dim, const int num_heads, diff --git a/exllama_ext/exllama_ext.cpp b/exllama_ext/exllama_ext.cpp index 280cf79c..615f7f4f 100644 --- a/exllama_ext/exllama_ext.cpp +++ b/exllama_ext/exllama_ext.cpp @@ -110,6 +110,16 @@ void set_tuning_params tuningParams.concurrent_streams = concurrent_streams; } + +// Release all unmanaged objects allocated by the extension + +void cleanup() +{ + cleanup_buffers_cuda(); + g_q4_free_matrices(); +} + + // Prepare buffers for forward pass void prepare_buffers @@ -408,12 +418,12 @@ void half_matmul_cublas void q4_attn ( - torch::Tensor x, // shape == (q_len, dim) + torch::Tensor x, // shape == (bsz, q_len, dim) torch::Tensor rms_norm_weight, // shape == (x.shape[1],) == (dim,) float epsilon, - torch::Tensor query_states, // shape == (q_len, dim) - torch::Tensor key_states, // shape == (q_len, dim) - torch::Tensor value_states, // shape == (q_len, dim) + torch::Tensor query_states, // shape == (bsz, q_len, dim) + torch::Tensor key_states, // shape == (bsz, q_len, dim) + torch::Tensor value_states, // shape == (bsz, q_len, dim) uintptr_t q_proj, uintptr_t k_proj, uintptr_t v_proj, @@ -438,7 +448,8 @@ void q4_attn TORCH_CHECK_DTYPE(query_states, kHalf); TORCH_CHECK_DTYPE(key_states, kHalf); - int dim = query_states.size(1); + int bsz = query_states.size(0); + int dim = query_states.size(2); torch::Device device = x.device(); int device_index = device.index(); @@ -467,6 +478,7 @@ void q4_attn reinterpret_cast(v_proj), (half*) sin.data_ptr(), (half*) cos.data_ptr(), + bsz, q_len, dim, head_dim, @@ -638,7 +650,8 @@ void rope_ TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim"); TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim"); - int rows = x.numel() / head_dim; + int bsz = x.size(0); + int rows_per_batch = x.numel() / head_dim / bsz; const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); @@ -648,7 +661,8 @@ void rope_ (half*) x.data_ptr(), (half*) sin.data_ptr(), (half*) cos.data_ptr(), - rows, + bsz, + rows_per_batch, head_dim, num_heads, past_len @@ -672,6 +686,8 @@ void rep_penalty int vocab_size = rep_mask.size(0); int seq_len = sequence.size(-1); + // TODO: Support batch size + rep_penalty_cpu ( vocab_size, @@ -684,10 +700,43 @@ void rep_penalty ); } +void apply_rep_penalty +( + torch::Tensor sequence, + float penalty_max, + int sustain, + int decay, + torch::Tensor logits +) +{ + TORCH_CHECK_DTYPE(sequence, kLong); + TORCH_CHECK_DTYPE(logits, kFloat); + TORCH_CHECK_SHAPES(sequence, 0, logits, 0, 1); + + int vocab_size = logits.size(-1); + int bsz = sequence.size(0); + int seq_len = sequence.size(-1); + + for (int i = 0; i < bsz; i++) + { + apply_rep_penalty_cpu + ( + vocab_size, + ((uint64_t*) sequence.data_ptr()) + i * seq_len, + penalty_max, + sustain, + decay, + seq_len, + ((float*) logits.data_ptr()) + i * vocab_size + ); + } +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 
m.def("set_tuning_params", &set_tuning_params, "set_tuning_params"); m.def("prepare_buffers", &prepare_buffers, "prepare_buffers"); + m.def("cleanup", &cleanup, "cleanup"); m.def("make_q4", &make_q4, "make_q4"); m.def("q4_matmul", &q4_matmul, "q4_matmul"); m.def("q4_matmul_lora", &q4_matmul_lora, "q4_matmul_lora"); @@ -700,5 +749,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("half_matmul", &half_matmul, "half_matmul"); m.def("half_matmul_cublas", &half_matmul_cublas, "half_matmul_cublas"); - m.def("rep_penalty", &rep_penalty, "repetition penalty mask"); + m.def("rep_penalty", &rep_penalty, "rep_penalty"); + m.def("apply_rep_penalty", &apply_rep_penalty, "apply_rep_penalty"); } diff --git a/model_init.py b/model_init.py index 7238a461..d61dbac9 100644 --- a/model_init.py +++ b/model_init.py @@ -12,6 +12,8 @@ def add_args(parser): parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. -gs 20,7,7") parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048) + parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0) + parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs") parser.add_argument("-mmrt", "--matmul_recons_thd", type = int, help = "No. rows at which to use reconstruction and cuBLAS for quant matmul. 0 = never, 1 = always", default = 8) @@ -74,6 +76,8 @@ def print_options(args, extra_options = None): print(f" -- Model config: {args.config}") print(f" -- Model: {args.model}") print(f" -- Sequence length: {args.length}") + if args.compress_pos_emb != 1.0: + print(f" -- RoPE compression factor: {args.compress_pos_emb}") print(f" -- Tuning:") print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else "")) @@ -98,6 +102,7 @@ def make_config(args): config.model_path = args.model config.max_seq_len = args.length + config.compress_pos_emb = args.compress_pos_emb config.set_auto_map(args.gpu_split) config.gpu_peer_fix = args.gpu_peer_fix diff --git a/perplexity.py b/perplexity.py index 593df6c1..84560998 100644 --- a/perplexity.py +++ b/perplexity.py @@ -32,54 +32,62 @@ def _begin(self): self.cache.current_seq_len = 0 - def _next_logits(self, input_ids, apply_lora, last_id_only=True): - n_logits = None + def _next_logits(self, input_ids, apply_lora, last_id_only = True): + n_logits = [] a = 0 while a < input_ids.shape[-1]: - b = min(input_ids.shape[-1], a + 2048) - n_logits = self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora) + b = min(input_ids.shape[-1], a + 2048) # TODO: Should this be a config parameter? + n_logits.append(self.model.forward(input_ids[:, a:b], self.cache, last_id_only, lora = apply_lora)) a = b - return n_logits + return torch.cat(n_logits, dim = 1) def _tokenize(self, text): return self.tokenizer.encode(text) - # This loads *and* tokenizes into chunks - def load(self, dataset_path, context=2048, overlap=0, minlength = 0): + # Load raw dataset from a text file and tokenize into chunks. 
Each chunk can optionally truncated to allow for + # evaluating the same data at different sequence lengths + + def load(self, dataset_path, chunk_size, chunk_truncate = None, overlap = 0, minlength = 0, json_key = "text"): + file_extension = os.path.splitext(dataset_path)[1] - # JSON format + # JSON format: Returned chunks may be of variable length, with each chunk representing one list item + if file_extension == '.jsonl' or file_extension == '.json': with open(dataset_path) as f: for line in f: - example = json.loads(line)["text"] + example = json.loads(line)[json_key] if len(example) > minlength: chunk = self._tokenize(example) - chunk = chunk[:, :context + 1] + chunk = chunk[:, :chunk_size] + if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] self.dataset_chunks.append(chunk) - # Raw Text + + # Raw Text: Returned chunks are fixed length windows of the entire tokenized dataset + else: - with open(dataset_path) as f: + with open(dataset_path, encoding="utf-8") as f: text = f.read() tokens = self._tokenize(text) # overlap shouldn't be bigger than the context, also need at least one token for predicting last... - if overlap >= context: - overlap = context-2 + if overlap >= chunk_size: + overlap = chunk_size-2 # We can't use torch.chunks since it want's to split things into equal sized chunks. Instead, let's do our own chunking start = 0 while start < tokens.size(1): - chunk = tokens[:, start:start+context] - start += context - overlap + chunk = tokens[:, start:start + chunk_size] + start += chunk_size - overlap + if chunk_truncate is not None: chunk = chunk[:, :chunk_truncate] self.dataset_chunks.append(chunk) - def test(self, chunk_limit=sys.maxsize, lora = None, tag="", ppl_token = False): + def test(self, chunk_limit = sys.maxsize, lora = None, tag = "", ppl_token = False): if not self.dataset_chunks: sys.exit(" xx ERROR: Empty dataset!") @@ -126,3 +134,45 @@ def test(self, chunk_limit=sys.maxsize, lora = None, tag="", ppl_token = False): print("") print(f" ** Perplexity{tag}: {perplexity:.4f}") + + +def add_args(parser): + + parser.add_argument("-ppl", "--perplexity", nargs = '?', const = 'default', metavar = "METHOD", help = "Perplexity benchmark. 
Optionally specify method: gptq-for-llama, llama.cpp (not yet implemented)") + parser.add_argument("-ppl_ds", "--perplexity_dataset", metavar = "DATAPATH", type = str, help = "Load dataset for perplexity (JSONL if .jsonl, otherwise parses it as raw text)") + parser.add_argument("-ppl_cn", "--perplexity_chunk_num", nargs = "?", type = int, help = "Number of chunks for perplexity benchmark", default = 100) + parser.add_argument("-ppl_cs", "--perplexity_chunk_size", type = int, help = "Size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl_ct", "--perplexity_chunk_truncate", type = int, help = "Truncated size of chunks for perplexity benchmark", default = 2048) + parser.add_argument("-ppl_co", "--perplexity_chunk_overlap", type = int, help = "Chunk overlap", default = 0) + parser.add_argument("-ppl_cm", "--perplexity_chunk_min", type = int, help = "Minimum chunk length", default = 50) + parser.add_argument("-ppl_key", "--perplexity_json_key", type = str, help = "Key to extract from JSON dataset, default: 'text'", default = "text") + parser.add_argument("-ppl_t", "--perplexity_token", action = "store_true", help = "Run perplexity test on individual tokens, for debug purposes (slow)") + + +def post_parse(args): + + if not args.perplexity: return + + # GPTQ-for-LLaMa equivalent + + if args.perplexity == "gptq-for-llama": + args.perplexity_dataset = "datasets/wikitext2.txt" + args.perplexity_chunk_num = 128 + args.perplexity_chunk_size = 2048 + args.perplexity_chunk_truncate = 2048 + args.perplexity_chunk_overlap = 0 + args.perplexity_chunk_min = 0 + + # Default dataset for legacy method + + if args.perplexity_dataset is None: args.perplexity_dataset = "datasets/wikitext2_val_sample.jsonl" + + print(f" -- Perplexity:") + print(f" -- - Dataset: {args.perplexity_dataset}") + print(f" -- - Chunks: {args.perplexity_chunk_num}") + print(f" -- - Chunk size: {args.perplexity_chunk_size}" + (f" -> {args.perplexity_chunk_truncate}" if args.perplexity_chunk_truncate is not None else "")) + print(f" -- - Chunk overlap: {args.perplexity_chunk_overlap}") + print(f" -- - Min. 
chunk size: {args.perplexity_chunk_min}") + print(f" -- - Key: {args.perplexity_json_key}") + if args.perplexity_token: print("f -- - Per-token mode") + diff --git a/requirements-web.txt b/requirements-web.txt new file mode 100644 index 00000000..d1bf2563 --- /dev/null +++ b/requirements-web.txt @@ -0,0 +1,2 @@ +flask==2.3.2 +waitress==2.1.2 diff --git a/setup.py b/setup.py index 94cc8596..13e3fa76 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name="exllama", - version="0.0.4", + version="0.0.5", install_requires=[ "torch", ], diff --git a/sh/test_chatbot.sh b/sh/test_chatbot.sh deleted file mode 100755 index 2595c58d..00000000 --- a/sh/test_chatbot.sh +++ /dev/null @@ -1,18 +0,0 @@ - -python test_chatbot.py -d /mnt/str/models/wizardlm-30b-uncensored-4bit-act-order/ -un "Maxine" -p prompt_assistant.txt -nnl -temp 1.0 -topp .75 - -#python test_chatbot.py \ -#-t /mnt/str/models/bluemoon-4k-13b-4bit-128g/tokenizer.model \ -#-c /mnt/str/models/bluemoon-4k-13b-4bit-128g/config.json \ -#-m /mnt/str/models/bluemoon-4k-13b-4bit-128g/bluemoonrp-13b-4k-epoch6-4bit-128g.safetensors \ -#-p prompt_bluemoon.txt \ -#-un "Player" \ -#-bn "DM" \ -#-bf \ -#-topk 30 \ -#-topp 0.45 \ -#-minp 0.1 \ -#-temp 1.4 \ -#-repp 1.3 \ -#-repps 256 \ -#-l 4096 \ No newline at end of file diff --git a/test_benchmark_inference.py b/test_benchmark_inference.py new file mode 100644 index 00000000..0c438631 --- /dev/null +++ b/test_benchmark_inference.py @@ -0,0 +1,302 @@ +from model import ExLlama, ExLlamaCache, ExLlamaConfig +from tokenizer import ExLlamaTokenizer +from generator import ExLlamaGenerator +from lora import ExLlamaLora +import perplexity +from perplexity import Perplexity +import time +import torch +import torch.nn.functional as F +import argparse +import json +import math +import sys +import os +import glob +import model_init + +torch.cuda._lazy_init() +# torch.backends.cuda.matmul.allow_tf32 = True +# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True +torch.set_printoptions(precision = 10) +torch_devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] + +cache = None +model = None + +def begin(): + global model, cache + + if cache is None: cache = ExLlamaCache(model) + else: cache.current_seq_len = 0 + + +def next_logits(input_ids, apply_lora, last_id_only = True, input_mask = None): + global model, cache + + n_logits = None + a = 0 + while a < input_ids.shape[-1]: + b = min(input_ids.shape[-1], a + 2048) + n_logits = model.forward(input_ids[:, a:b], cache, last_id_only, lora = apply_lora, input_mask = input_mask) + a = b + + return n_logits + + +def tokenize(text): + global tokenizer + + return tokenizer.encode(text) + + +def timer(name, func): + t = time.time() + ret = func() + t = time.time() - t + print(f" ** Time, {name}: {t:.2f} seconds") + return ret + + +mem_base = {} +mem_last = {} +for dev in torch_devices: + torch.cuda.reset_peak_memory_stats(dev) + mem_base[dev] = mem_last[dev] = torch.cuda.max_memory_allocated(dev) + +def mem(name, total = False): + global mem_base, mem_last + + res = f" ** VRAM, {name}: " + first = True + + for device in torch_devices: + mem_c = torch.cuda.max_memory_allocated(device) + mem_this = mem_c - mem_last[device] if not total else mem_c - mem_base[device] + mem_last[device] = mem_c + + if not first: res += " - " + first = False + res += f"[{device}] {mem_this / (1024 ** 2):,.2f} MB" + + print(res) + + +# Parse arguments + +parser = argparse.ArgumentParser(description = "Benchmark tests for ExLlama") + 
+model_init.add_args(parser) +perplexity.add_args(parser) + +parser.add_argument("-p", "--perf", action = "store_true", help = "Benchmark speed and VRAM usage") +parser.add_argument("-v", "--validate", action = "count", help = "Run validation check and generate some sample output; specify twice for a more thorough test") +parser.add_argument("-lora", "--lora", type = str, help = "Path to LoRA binary to use during benchmark") +parser.add_argument("-loracfg", "--lora_config", type = str, help = "Path to LoRA config to use during benchmark") +parser.add_argument("-ld", "--lora_dir", type = str, help = "Path to LoRA config and binary. to use during benchmark") + +args = parser.parse_args() + +model_init.post_parse(args) +perplexity.post_parse(args) +model_init.get_model_files(args) + +# Paths + +if args.lora_dir is not None: + args.lora_config = os.path.join(args.lora_dir, "adapter_config.json") + args.lora = os.path.join(args.lora_dir, "adapter_model.bin") + +# Feedback + +print_opts = [] +if args.perf: print_opts.append("perf") +if args.validate: print_opts.append("validate") +if args.perplexity: print_opts.append("perplexity") +if args.perplexity_token: print_opts.append("perplexity_token") + +model_init.print_options(args, print_opts) + +# Instantiate model + +config = model_init.make_config(args) + +model = timer("Load model", lambda: ExLlama(config)) +tokenizer = timer("Load tokenizer", lambda: ExLlamaTokenizer(args.tokenizer)) + +model_init.print_stats(model) + +torch.cuda.reset_peak_memory_stats("cuda") +mem("Model") + +# Load LoRA + +lora = None +if args.lora: + print(f" -- LoRA config: {args.lora_config}") + print(f" -- Loading LoRA: {args.lora}") + if args.lora_config is None: + print(f" ## Error: please specify lora path to adapter_config.json") + sys.exit() + lora = ExLlamaLora(model, args.lora_config, args.lora) + if lora.bias_ignored: + print(f" !! 
Warning: LoRA zero bias ignored") + +# Test sequence + +gen_tokens = 128 +max_seq_len = args.length +ids = torch.randint(0, 31999, (1, max_seq_len - gen_tokens)).cuda() + +# Benchmark memory and performance + +if args.perf: + + # Warming up apparently makes a huge difference + + for i in range(1, 3): + print(f" -- Warmup pass {i}...") + begin() + logits = timer("Warmup", lambda: next_logits(ids, lora)) + + # Do the actual benchmark + + begin() + + t = time.time() + + print(" -- Inference, first pass.") + logits = timer("Inference", lambda: next_logits(ids, lora)) + + t = time.time() - t + print(f" ** Speed: {ids.shape[-1] / t:.2f} tokens/second") + + for j in range(2): + + t = time.time() + print(f" -- Generating {gen_tokens} tokens, {ids.shape[-1]} token prompt...") + for i in range(gen_tokens): + + logits = logits[0, -1, :] + token = torch.argmax(logits) + next_id = token.unsqueeze(0).unsqueeze(0) + logits = next_logits(next_id, lora) + + t = time.time() - t + print(f" ** Speed: {gen_tokens / t:.2f} tokens/second") + + ids = ids[:, :4] + cache.current_seq_len = 4 + + mem("Inference") + mem("Total", total = True) + + +# Benchmark perplexity + +if args.perplexity: + + ppl = Perplexity(args.perplexity, model, cache, tokenizer) + + print(" -- Loading dataset...") + + ppl.load(dataset_path = args.perplexity_dataset, + chunk_size = args.perplexity_chunk_size, + chunk_truncate = args.perplexity_chunk_truncate, + overlap = args.perplexity_chunk_overlap, + minlength = args.perplexity_chunk_min, + json_key = args.perplexity_json_key) + + begin() + + ppl.test(args.perplexity_chunk_num, + lora = lora, + ppl_token = args.perplexity_token) + +# Validate file + +if args.validate: + + ppl = Perplexity(args.perplexity, model, cache, tokenizer) + + ppl.load(dataset_path = "datasets/wikitext2_val_sample.jsonl", + chunk_size = 2048, + chunk_truncate = 2048, + overlap = 0, + minlength = 50, + json_key = "text") + + # Short perplexity tests in switched and quant mode, should produce roughly equal results + + begin() + + model.config.matmul_recons_thd = 1 + ppl.test(8, lora = lora, tag = " (reconstruct)") + model.config.matmul_recons_thd = 0 + ppl.test(8, lora = lora, tag = " (quant, token)", ppl_token = True) + + # Do a short, easy topk=1 completion to see if we're generating garbage. 
Should run in switched mode + # for the prompt and quant for individual tokens + + model.config.matmul_recons_thd = 4 + generator = ExLlamaGenerator(model, tokenizer, cache) + generator.settings.top_k = 1 + generator.lora = lora + text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20 * args.validate) + print(f" ** Generation: {repr(text)}") + + if args.validate > 1: + + # Test batched generation + + bsz = 8 + gen_len = 20 + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + + # Bigger cache for the batch + + del cache + cache = ExLlamaCache(model, batch_size = bsz) + + # Create tokenized batch and attention mask + + identical_batch_prompt = "When you have eliminated the impossible, whatever remains," + continuations = [ + " must be considered", + " ought to be", + " (and some scholars say this is", + " however improbable, is a banana.", + ] + + prompts = [identical_batch_prompt] * (bsz - len(continuations)) + for cont in continuations: + prompts.append(identical_batch_prompt + cont) + + ids = tokenizer.encode(prompts) + assert ids.shape[1] < model.config.max_seq_len, f"Max length {ids.shape[1]} exceeds model limit {model.config.max_seq_len}" + + mask = ids.ne(tokenizer.pad_token_id) + + # Batched generation with greedy sampling + + sequence = torch.empty((bsz, 0), dtype = torch.long, device = "cpu") + logits = next_logits(ids, lora, input_mask = mask) + + for i in range(gen_len): + logits = logits[:, -1, :] + id_per_batch = torch.argmax(logits, dim=-1) + assert id_per_batch.shape == (bsz,), f"{id_per_batch.shape} != {(bsz,)}" + next_id_per_batch = id_per_batch.unsqueeze(-1) + sequence = torch.cat((sequence, next_id_per_batch), dim = -1) + logits = next_logits(next_id_per_batch, lora) + + # Print output batch + + print(f"\n ** Batching sanity check: 1-{bsz - len(continuations)} should be identical. All should be reasonable for the model you're using.\n") + + outputs = tokenizer.decode(sequence) + for b in range(bsz): + print(f"{b + 1} {repr(prompts[b])} -> {repr(outputs[b])}") + + # TODO Save the logits and then rerun each prompt with a batch size of 1, same input. The logits should be identical. 
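
For reference, a minimal sketch of how the new batch-aware `apply_rep_penalty` binding added in `exllama_ext.cpp` might be driven from Python. The import name `exllama_ext`, the tensor contents and the parameter values are assumptions for illustration only; the signature and dtype/shape expectations come from the pybind11 definition in this patch (a `(bsz, seq_len)` long tensor of token ids and a `(bsz, vocab_size)` float logits tensor modified in place).

```python
# Illustrative sketch only: the module name `exllama_ext` and the values below
# are assumptions. The call follows the binding added in this patch:
# apply_rep_penalty(sequence, penalty_max, sustain, decay, logits).
import torch
import exllama_ext

bsz, seq_len, vocab_size = 2, 128, 32000
sequence = torch.randint(0, vocab_size, (bsz, seq_len), dtype=torch.long)  # per-row token history
logits = torch.randn(bsz, vocab_size, dtype=torch.float)                   # per-row next-token logits

# Applied in place, one batch row at a time: full penalty over the most recent
# `sustain` tokens of each row, fading out across the preceding `decay` tokens.
exllama_ext.apply_rep_penalty(sequence, 1.15, 256, 128, logits)
```
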
diff --git a/webui/app.py b/webui/app.py index f921cf33..c2fa909f 100644 --- a/webui/app.py +++ b/webui/app.py @@ -1,6 +1,7 @@ import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from model import ExLlama, ExLlamaConfig from flask import Flask, render_template, request, jsonify from flask import Response, stream_with_context from threading import Timer, Lock @@ -10,7 +11,7 @@ from session import prepare_sessions, get_initial_session, Session, load_session, new_session, _sessions_dir import argparse from tokenizer import ExLlamaTokenizer -from model import ExLlama, ExLlamaConfig +from waitress import serve app = Flask(__name__) app.static_folder = 'static' @@ -117,12 +118,17 @@ def api_userinput(): result = Response(stream_with_context(session.respond_multi(user_input)), mimetype = 'application/json') return result +@app.route("/api/append_block", methods=['POST']) +def api_append_block(): + data = request.get_json() + session.api_append_block(data) + return json.dumps({"result": "ok"}) + "\n" # Load the model parser = argparse.ArgumentParser(description="Simple web-based chatbot for ExLlama") parser.add_argument("-host", "--host", type = str, help = "IP:PORT eg, 0.0.0.0:7862", default = "localhost:5000") -parser.add_argument("-sd", "--sessions-dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") +parser.add_argument("-sd", "--sessions_dir", type = str, help = "Location for storing user sessions, default: ~/exllama_sessions/", default = "~/exllama_sessions/") model_init.add_args(parser) args = parser.parse_args() @@ -155,4 +161,4 @@ def api_userinput(): if host == "localhost": Timer(1, lambda: webbrowser.open(f'http://{machine}/')).start() -app.run(host = host, port = port) \ No newline at end of file +serve(app, host = host, port = port) \ No newline at end of file diff --git a/webui/session.py b/webui/session.py index 4b07993f..5a715113 100644 --- a/webui/session.py +++ b/webui/session.py @@ -282,22 +282,23 @@ def api_populate(self): jnode["author_idx"] = self.participants.index(author) dic = {"sessions": names, - "current_session": name, - "fixed_prompt": self.fixed_prompt.text, - "keep_fixed_prompt": self.keep_fixed_prompt, - "participants": self.participants, - "history": historyjson, - "temperature": generator.settings.temperature, - "top_p": generator.settings.top_p, - "min_p": generator.settings.min_p, - "top_k": generator.settings.top_k, - "typical": generator.settings.typical, - "break_on_newline": self.break_on_newline, - "max_response_tokens": self.max_response_tokens, - "chunk_size": self.chunk_size, - "token_repetition_penalty_max": generator.settings.token_repetition_penalty_max, - "token_repetition_penalty_sustain": generator.settings.token_repetition_penalty_sustain, - "token_repetition_penalty_decay": generator.settings.token_repetition_penalty_decay} + "current_session": name, + "fixed_prompt": self.fixed_prompt.text, + "keep_fixed_prompt": self.keep_fixed_prompt, + "participants": self.participants, + "history": historyjson, + "temperature": generator.settings.temperature, + "top_p": generator.settings.top_p, + "min_p": generator.settings.min_p, + "top_k": generator.settings.top_k, + "typical": generator.settings.typical, + "break_on_newline": self.break_on_newline, + "max_response_tokens": self.max_response_tokens, + "chunk_size": self.chunk_size, + "token_repetition_penalty_max": generator.settings.token_repetition_penalty_max, + 
"token_repetition_penalty_sustain": generator.settings.token_repetition_penalty_sustain, + "token_repetition_penalty_decay": generator.settings.token_repetition_penalty_decay, + "max_seq_len": model.config.max_seq_len} # Add model info @@ -336,6 +337,22 @@ def api_edit_block(self, data): return + def api_append_block(self, data): + + author = None + if "author" in data: + author = data["author"] + else: + if len(self.participants) > 0: + author = self.participants[0] + + text = data["text"].strip() + + newNode = Node(text, author) + self.history.append(newNode) + self.save() + + def api_set_participants(self, data): self.participants = data["participants"] @@ -599,7 +616,8 @@ def respond_multi(self, user_input): end_time = time.time() elapsed = end_time - begin_time new_tokens = context.shape[-1] - reused - print(f"Prompt processed in {elapsed:.2f} seconds, {new_tokens} new tokens, {(new_tokens / elapsed):.2f} tokens/second:") + token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) + print(f"Prompt processed in {elapsed:.2f} seconds, {new_tokens} new tokens, {token_rate:.2f} tokens/second:") begin_time = time.time() total_tokens = [0] @@ -676,8 +694,9 @@ def respond_multi(self, user_input): end_time = time.time() elapsed = end_time - begin_time + token_rate = 0 if elapsed == 0 else (total_tokens[0] / elapsed) - print(f"Response generated in {elapsed:.2} seconds, {total_tokens[0]} tokens, {(total_tokens[0] / elapsed):.2f} tokens/second:") + print(f"Response generated in {elapsed:.2} seconds, {total_tokens[0]} tokens, {token_rate:.2f} tokens/second:") self.save() diff --git a/webui/static/main.js b/webui/static/main.js index 9ec07785..09cf1ce1 100644 --- a/webui/static/main.js +++ b/webui/static/main.js @@ -65,13 +65,15 @@ function sendGenSettings() { json.token_repetition_penalty_sustain = getTBNumber("sl_repp_sustain_tb"); json.token_repetition_penalty_decay = getTBNumber("sl_repp_decay_tb"); - console.log(json); + // console.log(json); send("/api/set_gen_settings", json); } -function setSlider(id, value) { +function setSlider(id, value, override_max = null) { let slider = document.getElementById(id); + if (override_max) slider.max = override_max; + let tb = document.getElementById(id + "_tb"); let decimals = slider.dataset.decimals; let mult = Math.pow(10, decimals); @@ -256,6 +258,8 @@ function populate() { let tf_model_info = document.getElementById("tf_model_info") tf_model_info.value = data.model_info; + let model_max_seq_lan = data.max_seq_len; + // Fixed prompt let tf_fixed_prompt = document.getElementById("tf_fixed_prompt") @@ -285,7 +289,7 @@ function populate() { // Repetition penalty setSlider("sl_repp_penalty", data.token_repetition_penalty_max); - setSlider("sl_repp_sustain", data.token_repetition_penalty_sustain); + setSlider("sl_repp_sustain", data.token_repetition_penalty_sustain, model_max_seq_lan); setSlider("sl_repp_decay", data.token_repetition_penalty_decay); // Participants