Add chat UI with gradio #17

Merged · 2 commits · May 5, 2024
25 changes: 24 additions & 1 deletion README.md
@@ -13,8 +13,31 @@ The easiest way to get started is to install the `mlx-vlm` package:
pip install mlx-vlm
```

**Inference**:
## Inference

**CLI**
```sh
python -m mlx_vlm.generate --model qnguyen3/nanoLLaVA --max-tokens 100 --temp 0.0
```

**Chat UI with Gradio**
```sh
python -m mlx_vlm.chat_ui --model qnguyen3/nanoLLaVA
```

**Script**
```python
import mlx.core as mx
from mlx_vlm import load, generate

model_path = "mlx-community/llava-1.5-7b-4bit"
model, processor = load(model_path)

prompt = processor.apply_chat_template(
[{"role": "user", "content": f"<image>\nWhat are these?"}],
tokenize=False,
add_generation_prompt=True,
)

output = generate(model, processor, "http://images.cocodataset.org/val2017/000000039769.jpg", prompt, verbose=False)
```
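The script example above can also be continued with a follow-up turn. The sketch below is a minimal illustration, assuming the model's chat template accepts a prior assistant message; the second question is purely illustrative:

```python
# Hypothetical follow-up turn, reusing `model`, `processor`, and `output` from above.
chat = [
    {"role": "user", "content": "<image>\nWhat are these?"},
    {"role": "assistant", "content": output},
    {"role": "user", "content": "What colors are they?"},
]
prompt = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
output = generate(
    model,
    processor,
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    prompt,
    verbose=False,
)
print(output)
```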
142 changes: 142 additions & 0 deletions mlx_vlm/chat_ui.py
@@ -0,0 +1,142 @@
import argparse
from typing import Optional

import gradio as gr
import mlx.core as mx

from mlx_vlm import load

from .prompt_utils import get_message_json
from .utils import (
    generate_step,
    load,
    load_config,
    load_image_processor,
    prepare_inputs,
    sample,
)


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Generate text from an image using a model."
    )
    parser.add_argument(
        "--model",
        type=str,
        default="qnguyen3/nanoLLaVA",
        help="The path to the local model directory or Hugging Face repo.",
    )
    return parser.parse_args()


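# Parse CLI args and load the config, model, processor, and image processor once
# at module import so the Gradio callbacks below can reuse them.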
args = parse_arguments()
config = load_config(args.model)
model, processor = load(args.model, {"trust_remote_code": True})
image_processor = load_image_processor(args.model)


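# Streaming generator: run the vision-language model once over the image and
# prompt, sample the first token, then yield decoded text as new tokens arrive.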
def generate(
    model,
    processor,
    image: str,
    prompt: str,
    image_processor=None,
    temp: float = 0.0,
    max_tokens: int = 100,
    repetition_penalty: Optional[float] = None,
    repetition_context_size: Optional[int] = None,
    top_p: float = 1.0,
):

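    # With a standalone image processor, `processor` itself acts as the tokenizer;
    # otherwise the tokenizer is nested under `processor.tokenizer`.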
    if image_processor is not None:
        tokenizer = processor
    else:
        tokenizer = processor.tokenizer

    input_ids, pixel_values = prepare_inputs(image_processor, processor, image, prompt)
    logits, cache = model(input_ids, pixel_values)
    logits = logits[:, -1, :]
    y, _ = sample(logits, temp, top_p)

    detokenizer = processor.detokenizer
    detokenizer.reset()

    detokenizer.add_token(y.item())

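    # Stream the remaining tokens, stopping at EOS or after `max_tokens`.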
    for (token, _), n in zip(
        generate_step(
            model.language_model,
            logits,
            cache,
            temp,
            repetition_penalty,
            repetition_context_size,
            top_p,
        ),
        range(max_tokens),
    ):
        token = token.item()

        if token == tokenizer.eos_token_id:
            break

        detokenizer.add_token(token)
        detokenizer.finalize()
        yield detokenizer.last_segment


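# Gradio callback: rebuild the conversation in the model's chat-template format
# and stream the assistant reply back to the UI as it is generated.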
def chat(message, history, temperature, max_tokens):

    chat = []
    for item in history:
        chat.append(get_message_json(config["model_type"], item[0]))
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})

    if message["files"]:
        chat.append(get_message_json(config["model_type"], message["text"]))

    messages = processor.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    response = ""
    for chunk in generate(
        model,
        processor,
        message["files"][0],
        messages,
        image_processor,
        temperature,
        max_tokens,
    ):
        response += chunk
        yield response


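# Multimodal chat interface; the two sliders are passed to `chat` as its
# `temperature` and `max_tokens` arguments.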
demo = gr.ChatInterface(
    fn=chat,
    title="MLX-VLM Chat UI",
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Slider(
            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
        ),
        gr.Slider(
            minimum=128,
            maximum=4096,
            step=1,
            value=200,
            label="Max new tokens",
            render=False,
        ),
    ],
    description=f"Now Running {args.model}",
    multimodal=True,
)

demo.launch(inbrowser=True)
8 changes: 6 additions & 2 deletions mlx_vlm/utils.py
@@ -128,7 +128,7 @@ def load_model(model_path: Path, lazy: bool = False) -> nn.Module:

model_id= "<huggingface_model_id>"
model = AutoModelForCausalLM.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

model.save_pretrained("<local_dir>")
processor.save_pretrained("<local_dir>")
@@ -233,7 +233,11 @@ def load(
    return model, processor


def load_config(model_path: Path) -> dict:
def load_config(model_path: Union[str, Path]) -> dict:

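    # Accept either a Hugging Face repo id string or a local path; resolve
    # repo ids to a local model directory before reading config.json.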
    if isinstance(model_path, str):
        model_path = get_model_path(model_path)

    try:
        with open(model_path / "config.json", "r") as f:
            config = json.load(f)
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,5 +4,6 @@ numpy
transformers
torch
huggingface_hub
gradio
Pillow
requests