Add video-llama LVM microservice under lvms (opea-project#495)
Signed-off-by: BaoHuiling <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: srinarayan-srikanthan <[email protected]>
2 people authored and srinarayan-srikanthan committed Sep 1, 2024
1 parent 389fb61 commit bb01c19
Showing 18 changed files with 812 additions and 1 deletion.
1 change: 1 addition & 0 deletions comps/__init__.py
@@ -18,6 +18,7 @@
RAGASScores,
GraphDoc,
LVMDoc,
LVMVideoDoc,
)

# Constants
10 changes: 9 additions & 1 deletion comps/cores/proto/docarray.py
@@ -5,7 +5,7 @@

import numpy as np
from docarray import BaseDoc, DocList
from docarray.documents import AudioDoc
from docarray.documents import AudioDoc, VideoDoc
from docarray.typing import AudioUrl
from pydantic import Field, conint, conlist, field_validator

@@ -171,3 +171,11 @@ class LVMDoc(BaseDoc):
temperature: float = 0.01
repetition_penalty: float = 1.03
streaming: bool = False


class LVMVideoDoc(BaseDoc):
video_url: str
chunk_start: float
chunk_duration: float
prompt: str
max_new_tokens: conint(ge=0, le=1024) = 512
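
The new `LVMVideoDoc` is a pydantic-based docarray `BaseDoc`, so its field constraints are checked at construction time. Below is a minimal sketch (illustrative only, not part of this commit) of building a document and triggering the `conint(ge=0, le=1024)` bound on `max_new_tokens`:

```python
# Illustrative sketch, not part of the commit: LVMVideoDoc field validation.
from pydantic import ValidationError

from comps import LVMVideoDoc  # exported via the comps/__init__.py change above

doc = LVMVideoDoc(
    video_url="https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
    chunk_start=0.0,
    chunk_duration=9.0,
    prompt="What is the person doing?",
)
print(doc.max_new_tokens)  # 512, the declared default

try:
    LVMVideoDoc(
        video_url="clip.mp4",
        chunk_start=0.0,
        chunk_duration=9.0,
        prompt="hi",
        max_new_tokens=4096,  # violates conint(ge=0, le=1024)
    )
except ValidationError as err:
    print(err)
```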
18 changes: 18 additions & 0 deletions comps/lvms/video-llama/Dockerfile
@@ -0,0 +1,18 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

# Set environment variables
ENV LANG=en_US.UTF-8

COPY comps /home/comps

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/comps/lvms/video-llama/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home

WORKDIR /home/comps/lvms/video-llama

ENTRYPOINT ["python", "lvm.py"]
70 changes: 70 additions & 0 deletions comps/lvms/video-llama/README.md
@@ -0,0 +1,70 @@
# LVM Microservice

This is a Docker-based microservice that runs Video-Llama as a Large Vision Model (LVM). It uses Llama-2-7b-chat-hf for conversations grounded in video content and supports Intel Xeon CPUs.

# 🚀 1. Start Microservice with Docker

## 1.1 Build Images

```bash
cd GenAIComps
# Video-Llama Server Image
docker build --no-cache -t opea/video-llama-lvm-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/server/docker/Dockerfile .
# LVM Service Image
docker build --no-cache -t opea/lvm-video-llama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/Dockerfile .
```

## 1.2 Start Video-Llama and LVM Services

For the very first run, please follow the steps below:

```bash
# prepare environment variables
export ip_address=$(hostname -I | awk '{print $1}')
export no_proxy=$no_proxy,${ip_address}
export LVM_ENDPOINT=http://${ip_address}:9009
# Start service
docker compose -f comps/lvms/video-llama/docker_compose.yaml up -d
# it should take about 1.5 hours for the model to download in the video-llama server, assuming a maximum download speed of 100 Mbps
until docker logs video-llama-lvm-server 2>&1 | grep -q "Uvicorn running on"; do
sleep 5m
done
```
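
If you prefer to wait from a script instead of polling the container logs, a minimal readiness check (illustrative only, not part of this commit) can simply wait until the Video-Llama server's port 9009 accepts TCP connections, which only happens once Uvicorn is up:

```python
# Illustrative only: block until the video-llama server (port 9009) is reachable.
import socket
import time


def wait_for_port(host: str, port: int, interval_s: float = 300.0) -> None:
    """Retry a TCP connection until the server is listening."""
    while True:
        try:
            with socket.create_connection((host, port), timeout=5):
                return
        except OSError:
            time.sleep(interval_s)  # the first model download may take ~1.5 hours


wait_for_port("localhost", 9009)
print("video-llama server is up")
```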

If you have run the microservice before, it is recommended to keep the downloaded model so it is not re-downloaded on every run. To do this, modify the following configuration:

```yaml
# comps/lvms/video-llama/docker_compose.yaml
services:
lvm-video-llama:
...
environment:
llm_download: "False" # avoid download
```

# ✅ 2. Test

```bash
# use curl
export ip_address=$(hostname -I | awk '{print $1}')
## check video-llama
http_proxy="" curl -X POST "http://${ip_address}:9009/generate?video_url=https%3A%2F%2Fgithub.com%2FDAMO-NLP-SG%2FVideo-LLaMA%2Fraw%2Fmain%2Fexamples%2Fsilence_girl.mp4&start=0.0&duration=9&prompt=What%20is%20the%20person%20doing%3F&max_new_tokens=150" -H "accept: */*" -d ''

## check lvm
http_proxy="" curl -X POST http://${ip_address}:9000/v1/lvm -d '{"video_url":"https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4","chunk_start": 0,"chunk_duration": 9,"prompt":"What is the person doing?","max_new_tokens": 150}' -H 'Content-Type: application/json'

# or use python
export ip_address=$(hostname -I | awk '{print $1}')
python comps/lvms/video-llama/check_lvm.py
```

# ♻️ 3. Clean

```bash
# remove the container
cid=$(docker ps -aq --filter "name=video-llama")
if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
# remove the model volume (suggested: keep it to avoid re-downloading the model on each run)
if docker volume ls | grep -q video-llama-model; then docker volume rm video-llama_video-llama-model; fi

```
50 changes: 50 additions & 0 deletions comps/lvms/video-llama/check_lvm.py
@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import datetime
import json
import os

import requests

ip_address = os.getenv("ip_address")
####### video-llama request ########
print("video-llama request")
api_url = f"http://${ip_address}:9009/generate"
content = {
"video_url": "https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
"start": 0.0,
"duration": 9,
"prompt": "What is the person doing?",
"max_new_tokens": 150,
}

start = datetime.datetime.now()
with requests.post(api_url, params=content, stream=True) as response:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
print(chunk.decode("utf-8"), end="", flush=True) # Flush to ensure immediate output

end = datetime.datetime.now()
print(f"\nTotal time: {end - start}")

####### lvm request ########
print("lvm request")
api_url = f"http://${ip_address}:9000/v1/lvm"
headers = {"Content-Type": "application/json"}
data = {
"video_url": "https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
"chunk_start": 0,
"chunk_duration": 9,
"prompt": "what is the person doing",
"max_new_tokens": 150,
}

start = datetime.datetime.now()
with requests.post(api_url, headers=headers, data=json.dumps(data), stream=True) as response:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
print(chunk.decode("utf-8"), end="", flush=True) # Flush to ensure immediate output

end = datetime.datetime.now()
print(f"\nTotal time: {end - start}")
40 changes: 40 additions & 0 deletions comps/lvms/video-llama/docker_compose.yaml
@@ -0,0 +1,40 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"
services:
lvm-video-llama:
image: opea/video-llama-lvm-server:latest
container_name: video-llama-lvm-server
ports:
- "9009:9009"
ipc: host
environment:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ${no_proxy}
llm_download: "True"
volumes:
- "/home/$USER/.cache:/home/user/.cache" # RECOMMENDED: use local cache to avoid download
- video-llama-model:/home/user/model
restart: unless-stopped

lvm:
image: opea/lvm-video-llama:latest
container_name: lvm-video-llama
ports:
- "9000:9000"
ipc: host
environment:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ${no_proxy}
LVM_ENDPOINT: ${LVM_ENDPOINT}
restart: unless-stopped
depends_on:
- lvm-video-llama
networks:
default:
driver: bridge
volumes:
video-llama-model:
80 changes: 80 additions & 0 deletions comps/lvms/video-llama/lvm.py
@@ -0,0 +1,80 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


# import json
import logging
import os

import requests
from fastapi import HTTPException
from fastapi.responses import StreamingResponse

from comps import LVMVideoDoc, ServiceType, opea_microservices, register_microservice, register_statistics

# import time


logging.basicConfig(level=logging.INFO)


@register_microservice(
name="opea_service@lvm",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9000,
input_datatype=LVMVideoDoc,
output_datatype=StreamingResponse,
)
@register_statistics(names=["opea_service@lvm"])
async def lvm(input: LVMVideoDoc):
"""This function handles the LVM microservice, which generates text based on a video URL, start time, duration, prompt, and maximum new tokens.
Parameters:
input (LVMVideoDoc): The input containing the video URL, start time, duration, prompt, and maximum new tokens.
Returns:
StreamingResponse: A streaming response containing the generated text in text/event-stream format, or a JSON error response if the upstream API responds with an error.
"""
logging.info("[lvm] Received input")

video_url = input.video_url
chunk_start = input.chunk_start
chunk_duration = input.chunk_duration
prompt = input.prompt
max_new_tokens = input.max_new_tokens

params = {
"video_url": video_url,
"start": chunk_start,
"duration": chunk_duration,
"prompt": prompt,
"max_new_tokens": max_new_tokens,
}
logging.info(f"[lvm] Params: {params}")

response = requests.post(url=f"{lvm_endpoint}/generate", params=params, proxies={"http": None}, stream=True)
logging.info(f"[lvm] Response status code: {response.status_code}")
if response.status_code == 200:

def streamer():
yield f"{{'video_url': '{video_url}', 'chunk_start': {chunk_start}, 'chunk_duration': {chunk_duration}}}\n".encode(
"utf-8"
)
for chunk in response.iter_content(chunk_size=8192):
if chunk:
yield chunk
logging.info(f"[llm - chat_stream] Streaming: {chunk}")
logging.info("[llm - chat_stream] stream response finished")

return StreamingResponse(streamer(), media_type="text/event-stream")
else:
logging.error(f"[lvm] Error: {response.text}")
raise HTTPException(status_code=500, detail="The upstream API responded with an error.")


if __name__ == "__main__":
lvm_endpoint = os.getenv("LVM_ENDPOINT")

opea_microservices["opea_service@lvm"].start()
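
For reference, here is a minimal client sketch (not part of this commit) that consumes the streaming response produced by `lvm.py` above, assuming the LVM service is reachable on port 9000 of the local host as in `docker_compose.yaml`; the first streamed line echoes the chunk metadata and the remaining chunks are the generated text:

```python
# Illustrative client sketch for the /v1/lvm streaming endpoint above.
import requests

payload = {
    "video_url": "https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
    "chunk_start": 0,
    "chunk_duration": 9,
    "prompt": "What is the person doing?",
    "max_new_tokens": 150,
}

with requests.post("http://localhost:9000/v1/lvm", json=payload, stream=True) as resp:
    resp.raise_for_status()
    lines = resp.iter_lines(decode_unicode=True)
    metadata = next(lines)        # "{'video_url': ..., 'chunk_start': 0, 'chunk_duration': 9}"
    generated = "\n".join(lines)  # the rest of the stream is the model output

print(metadata)
print(generated)
```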
11 changes: 11 additions & 0 deletions comps/lvms/video-llama/requirements.txt
@@ -0,0 +1,11 @@
datasets
docarray
fastapi
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
Pillow
prometheus-fastapi-instrumentator
pydub
shortuuid
uvicorn
Binary file not shown.
38 changes: 38 additions & 0 deletions comps/lvms/video-llama/server/docker/Dockerfile
@@ -0,0 +1,38 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.9-slim

ENV LANG=C.UTF-8

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
git git-lfs && \
git lfs install

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user:user /home/user/
RUN mkdir /home/user/model && chown user:user -R /home/user/model

USER user

COPY --chown=user:user comps /home/user/comps
WORKDIR /home/user/comps/lvms/video-llama/server

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/lvms/video-llama/server/requirements.txt

ARG VIDEO_LLAMA_REPO=https://github.com/DAMO-NLP-SG/Video-LLaMA.git
ARG VIDEO_LLAMA_COMMIT=0adb19e
RUN tar -xvf video-llama.patch.tar && \
git clone ${VIDEO_LLAMA_REPO} Video-LLaMA && \
cd Video-LLaMA && git checkout ${VIDEO_LLAMA_COMMIT} && \
git apply --whitespace=fix ../video-llama.patch && \
mv video_llama ../ && \
cd ../ && rm -rf Video-LLaMA


ENV PYTHONPATH=/home/user


ENTRYPOINT ["bash", "start.sh"]
25 changes: 25 additions & 0 deletions comps/lvms/video-llama/server/docker/docker_compose_vllama.yaml
@@ -0,0 +1,25 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"
services:
lvm-video-llama:
image: opea/video-llama-lvm-server:latest
container_name: video-llama-lvm-server
ports:
- "9009:9009"
ipc: host
environment:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ${no_proxy}
llm_download: "True"
volumes:
- "/home/$USER/.cache:/home/user/.cache" # RECOMMENDED: use cache to avoid download
- video-llama-model:/home/user/model
restart: unless-stopped
networks:
default:
driver: bridge
volumes:
video-llama-model: