Support Valley (#2921)
support valley
lxline authored Jan 15, 2025
1 parent 5c1f043 commit b925d28
Showing 12 changed files with 235 additions and 5 deletions.
1 change: 1 addition & 0 deletions docs/source/Instruction/支持的模型和数据集.md
@@ -646,6 +646,7 @@
|[LLM-Research/MolmoE-1B-0924](https://modelscope.cn/models/LLM-Research/MolmoE-1B-0924)|molmoe|molmo|transformers>=4.45|vision|[allenai/MolmoE-1B-0924](https://huggingface.co/allenai/MolmoE-1B-0924)|
|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b)|pixtral|pixtral|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|[InfiniAI/Megrez-3B-Omni](https://modelscope.cn/models/InfiniAI/Megrez-3B-Omni)|megrez_omni|megrez_omni|-|vision, audio|[Infinigence/Megrez-3B-Omni](https://huggingface.co/Infinigence/Megrez-3B-Omni)|
|[bytedance-research/Valley-Eagle-7B](https://modelscope.cn/models/bytedance-research/Valley-Eagle-7B)|valley|valley|transformers>=4.42, av|vision|-|


## 数据集
@@ -646,6 +646,7 @@ The table below introduces the models integrated with ms-swift:
|[LLM-Research/MolmoE-1B-0924](https://modelscope.cn/models/LLM-Research/MolmoE-1B-0924)|molmoe|molmo|transformers>=4.45|vision|[allenai/MolmoE-1B-0924](https://huggingface.co/allenai/MolmoE-1B-0924)|
|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b)|pixtral|pixtral|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|[InfiniAI/Megrez-3B-Omni](https://modelscope.cn/models/InfiniAI/Megrez-3B-Omni)|megrez_omni|megrez_omni|-|vision, audio|[Infinigence/Megrez-3B-Omni](https://huggingface.co/Infinigence/Megrez-3B-Omni)|
|[bytedance-research/Valley-Eagle-7B](https://modelscope.cn/models/bytedance-research/Valley-Eagle-7B)|valley|valley|transformers>=4.42, av|vision|-|


## Datasets
1 change: 1 addition & 0 deletions swift/llm/model/constant.py
@@ -186,6 +186,7 @@ class MLLMModelType:
molmoe = 'molmoe'
pixtral = 'pixtral'
megrez_omni = 'megrez_omni'
valley = 'valley'


class ModelType(LLMModelType, MLLMModelType, BertModelType, RMModelType):
2 changes: 1 addition & 1 deletion swift/llm/model/model/__init__.py
@@ -1,2 +1,2 @@
from . import (baai, baichuan, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba, microsoft,
minicpm, mistral, mllm, mplug, openbuddy, qwen, skywork, telechat, yi)
minicpm, mistral, mllm, mplug, openbuddy, qwen, skywork, telechat, valley, yi)
63 changes: 63 additions & 0 deletions swift/llm/model/model/valley.py
@@ -0,0 +1,63 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import sys
from functools import partial, wraps
from typing import Any, Dict

from transformers import AutoConfig

from swift.llm import TemplateType
from ..constant import MLLMModelType
from ..model_arch import ModelArch
from ..register import (Model, ModelGroup, ModelMeta, get_model_tokenizer_multimodal,
get_model_tokenizer_with_flash_attn, register_model)
from ..utils import ModelInfo, git_clone_github, safe_snapshot_download


def get_model_tokenizer_valley(model_dir: str,
model_info: ModelInfo,
model_kwargs: Dict[str, Any],
load_model: bool = True,
**kwargs):
llm_model_type = kwargs.pop('llm_model_type')
local_repo_path = kwargs.get('local_repo_path')
if not local_repo_path:
repo_path = 'https://github.com/bytedance/Valley.git'
local_repo_path = git_clone_github(repo_path)
sys.path.append(os.path.join(local_repo_path))

if llm_model_type == 'valley':
from valley_eagle.model.language_model.valley_qwen2 import ValleyQwen2ForCausalLM, ValleyConfig
model_config = ValleyConfig.from_pretrained(model_dir)
model_config.mm_vision_tower = safe_snapshot_download('AI-ModelScope/siglip-so400m-patch14-384')
model_config.eagle_vision_tower = safe_snapshot_download('Qwen/Qwen2-VL-7B-Instruct')
automodel_class = ValleyQwen2ForCausalLM

kwargs['model_config'] = model_config
kwargs['automodel_class'] = automodel_class
model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, model_info, model_kwargs, load_model, **kwargs)
model.generation_config.repetition_penalty = 1.0 # Otherwise, Error. Same for original code.
if model is not None:
from transformers import AutoProcessor, SiglipImageProcessor
tokenizer.image_processor = SiglipImageProcessor.from_pretrained(model.config.mm_vision_tower)
tokenizer.qwen2vl_processor = AutoProcessor.from_pretrained(
model.config.eagle_vision_tower, max_pixels=1280 * 28 * 28)
tokenizer.image_processor.crop_size = tokenizer.image_processor.size['height']
return model, tokenizer


register_model(
ModelMeta(
MLLMModelType.valley,
[
ModelGroup([
Model('bytedance-research/Valley-Eagle-7B'),
], ),
],
TemplateType.valley,
partial(get_model_tokenizer_valley, llm_model_type='valley'),
architectures=['ValleyQwen2ForCausalLM'],
model_arch=ModelArch.valley,
requires=['transformers>=4.42', 'av'],
tags=['vision'],
))
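
For a quick check that this registration resolves end to end, the loader can be driven directly. A minimal sketch, assuming get_model_tokenizer is exported from swift.llm the same way as for the other model types; the first call clones https://github.com/bytedance/Valley.git and downloads the SigLIP and Qwen2-VL vision towers referenced above:

from swift.llm import get_model_tokenizer  # illustrative sketch, not part of this diff

model, tokenizer = get_model_tokenizer('bytedance-research/Valley-Eagle-7B')
print(type(model).__name__)              # expected: ValleyQwen2ForCausalLM
print(type(tokenizer.image_processor))   # SiglipImageProcessor attached by the loader above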
1 change: 1 addition & 0 deletions swift/llm/model/model_arch.py
@@ -60,6 +60,7 @@ class MLLMModelArch:
molmo = 'molmo'
emu3_chat = 'emu3_chat'
megrez_omni = 'megrez_omni'
valley = 'valley'


class ModelArch(LLMModelArch, MLLMModelArch):
1 change: 1 addition & 0 deletions swift/llm/template/constant.py
@@ -149,6 +149,7 @@ class MLLMTemplateType:
florence = 'florence'
molmo = 'molmo'
megrez_omni = 'megrez_omni'
valley = 'valley'


class TemplateType(LLMTemplateType, MLLMTemplateType, RMTemplateType):
2 changes: 1 addition & 1 deletion swift/llm/template/template/__init__.py
@@ -1,2 +1,2 @@
from . import (deepseek, emu3, gemma, glm, got_ocr, idefics3, internlm, internvl, llama, llava, llm, megrez, microsoft,
minicpm, molmo, mplug, openbuddy, pixtral, qwen, yi)
minicpm, molmo, mplug, openbuddy, pixtral, qwen, valley, yi)
139 changes: 139 additions & 0 deletions swift/llm/template/template/valley.py
@@ -0,0 +1,139 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import io
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional

import torch
from PIL import Image

from ..base import Template
from ..constant import MLLMTemplateType
from ..register import register_template
from ..template_inputs import StdTemplateInputs
from ..utils import Context
from .utils import ChatmlTemplateMeta


@dataclass
class ValleyTemplateMeta(ChatmlTemplateMeta):
auto_add_bos: bool = False
default_system: Optional[str] = ('You are Valley, a large language and vision assistant trained by ByteDance.'
'You are able to understand the visual content or video that the user provides,'
' and assist the user with a variety of tasks using natural language.'
'Follow the instructions carefully and explain your answers in detail.')


class ValleyTemplate(Template):
skip_prompt = True
use_model = True

def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index,
inputs: StdTemplateInputs) -> List[Context]:
# assert media_type == 'image'
if media_type == 'video':
from ..vision_utils import load_video_valley
return self.replace_video2image(load_video_valley, inputs, lambda i: [[151665, -200, 151666]])
return [[151665, -200, 151666]]

def preprocess_images(self, image_binary_list):
from valley_eagle.util.mm_utils import process_anyres_image

def byte2image(byte_data):
return Image.open(io.BytesIO(byte_data))

images = []
for binary in image_binary_list:
if isinstance(binary, Image.Image):
images.append(binary.convert('RGB'))
elif isinstance(binary, bytes):
images.append(byte2image(binary))
else:
raise ValueError('unsupported type')
video_pad = []
for img in images:
if self.model.config.anyres:
image = process_anyres_image(img, self.tokenizer.image_processor, self.model.config.grid_pinpoints)
else:
image = self.tokenizer.image_processor(img, return_tensors='pt')['pixel_values'][0]
video_pad.append(image)

if not self.model.config.anyres:
video = torch.stack(video_pad, dim=0)
else:
video = [torch.stack(img, dim=0) for img in video_pad]
return video

def process_images(self, inputs, images_binary):
import re
from qwen_vl_utils import fetch_image

if inputs.messages[-1]['role'] == 'user':
text = inputs.messages[-1]['content']
elif len(inputs.messages) > 1 and inputs.messages[-2]['role'] == 'user':
text = inputs.messages[-2]['content']
else:
text = ''
video_images_tensor = self.preprocess_images(images_binary)
img_length = len(video_images_tensor)
video_images_tensor = [video_images_tensor]
if img_length:
images = [[item.to(self.model.device).to(self.model.dtype) for item in img] for img in video_images_tensor]

messages_qwen = []
image_list = []
if isinstance(images_binary[0], Image.Image):
images_pil = [img.convert('RGB') for img in images_binary]
elif isinstance(images_binary[0], bytes):
images_pil = [Image.open(io.BytesIO(img)).convert('RGB') for img in images_binary]
image_sizes = torch.tensor([[x.size for x in images_pil]])
for image_file in images_pil:
image = fetch_image({'image': image_file})
image_list.append(image)
messages_qwen.append({'role': 'user', 'content': [{'type': 'text', 'text': text}]})
messages_qwen.append({'role': 'assistant', 'content': [{'type': 'text', 'text': ''}]})
text = self.tokenizer.qwen2vl_processor.apply_chat_template(
messages_qwen[:-1], tokenize=False, add_generation_prompt=True)
text_segs = re.split('<image>', text)
text = '<|vision_start|><|image_pad|><|vision_end|>'.join(text_segs[:len(image_list) + 1]) + ''.join(
text_segs[len(image_list) + 1:])
data_dict_qwen2vl = self.tokenizer.qwen2vl_processor(
text=[text], images=image_list, padding=True, return_tensors='pt')
results = {}

results['images'] = images
results['image_sizes'] = image_sizes
results['pixel_values'] = data_dict_qwen2vl['pixel_values'].to(self.model.device)
results['image_grid_thw'] = data_dict_qwen2vl['image_grid_thw'].to(self.model.device)
return results

def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
encoded = super()._encode(inputs)
images = inputs.images or []
input_ids = encoded['input_ids']
labels = encoded['labels']
if images:
results = self.process_images(inputs, images)
encoded['images'] = results['images']
encoded['image_sizes'] = results['image_sizes']
encoded['pixel_values'] = results['pixel_values']
encoded['image_grid_thw'] = results['image_grid_thw']
encoded['input_ids'] = input_ids
encoded['labels'] = labels
return encoded

def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
res = super()._data_collator(batch, padding_to=padding_to)
if 'images' in batch[0]:
res['images'] = sum([b['images'] for b in batch if 'images' in b], start=[])
res['image_sizes'] = torch.concat([b['image_sizes'] for b in batch if 'image_sizes' in b], dim=0)
for media_type in ['image', 'video']:
grid_thw = [b[f'{media_type}_grid_thw'] for b in batch if b.get(f'{media_type}_grid_thw') is not None]
if grid_thw:
res[f'{media_type}_grid_thw'] = torch.concat(grid_thw)
return res


register_template(ValleyTemplateMeta(
MLLMTemplateType.valley,
template_cls=ValleyTemplate,
))
11 changes: 11 additions & 0 deletions swift/llm/template/vision_utils.py
@@ -267,6 +267,17 @@ def load_audio_qwen(audio_io: BytesIO, sampling_rate: int):
return librosa.load(audio_io, sr=sampling_rate)[0]


@load_file_decorator
def load_video_valley(video_io: BytesIO):
import decord
from torchvision import transforms
video_reader = decord.VideoReader(video_io)
decord.bridge.set_bridge('torch')
video = video_reader.get_batch(np.linspace(0, len(video_reader) - 1, 8).astype(np.int_)).byte()
images = [transforms.ToPILImage()(image.permute(2, 0, 1)).convert('RGB') for image in video]
return images


def normalize_bbox(objects: List[Dict[str, Any]], images: List[Image.Image], to_type: Literal['real', 'norm_1000',
'norm_1']) -> None:
"""Normalize bbox to needed.
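
A usage sketch for the new helper, assuming load_file_decorator lets it accept a local path or URL like the other loaders in this module (decord and torchvision are required, and 'demo.mp4' is a placeholder):

from swift.llm.template.vision_utils import load_video_valley  # illustrative sketch, not part of this diff

frames = load_video_valley('demo.mp4')    # uniformly samples 8 frames across the clip
print(len(frames))                        # 8
print(frames[0].size, frames[0].mode)     # e.g. (1280, 720) 'RGB'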
10 changes: 8 additions & 2 deletions tests/test_align/test_template/test_video.py
@@ -90,6 +90,11 @@ def test_minicpmv():
_infer_model(pt_engine)


def test_valley():
pt_engine = PtEngine('bytedance-research/Valley-Eagle-7B')
_infer_model(pt_engine)


if __name__ == '__main__':
from swift.llm import PtEngine, RequestConfig, get_template
from swift.utils import get_logger, seed_everything
@@ -98,5 +103,6 @@ def test_minicpmv():
# test_internvl2_5()
# test_xcomposer2_5()
# test_internvl2_5_mpo()
test_mplug3()
test_minicpmv()
# test_mplug3()
# test_minicpmv()
test_valley()
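
_infer_model above is shared scaffolding in this test suite. A standalone equivalent of the video path would look roughly like the sketch below; InferRequest and its videos field are assumed from ms-swift's inference API (only PtEngine and RequestConfig appear in this diff), and the template turns <video> into 8 frames via load_video_valley:

from swift.llm import PtEngine, RequestConfig, InferRequest  # illustrative sketch, not part of this diff

engine = PtEngine('bytedance-research/Valley-Eagle-7B')
request = InferRequest(
    messages=[{'role': 'user', 'content': '<video>Describe what happens in this video.'}],
    videos=['demo.mp4'])  # placeholder path
resp = engine.infer([request], RequestConfig(max_tokens=128, temperature=0))
print(resp[0].choices[0].message.content)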
8 changes: 7 additions & 1 deletion tests/test_align/test_template/test_vision.py
@@ -334,6 +334,11 @@ def test_doc_owl2():
'more efficient and accurate OCR-free document understanding.')


def test_valley():
pt_engine = PtEngine('bytedance-research/Valley-Eagle-7B')
_infer_model(pt_engine)


if __name__ == '__main__':
from swift.llm import PtEngine, RequestConfig, get_template
from swift.utils import get_logger, seed_everything
Expand All @@ -344,7 +349,7 @@ def test_doc_owl2():
# test_internvl2_phi3()
# test_llava()
# test_ovis1_6()
test_ovis1_6_llama3()
# test_ovis1_6_llama3()
# test_yi_vl()
# test_deepseek_vl()
# test_deepseek_vl2()
@@ -372,3 +377,4 @@ def test_doc_owl2():
# test_molmo()
# test_molmoe()
# test_doc_owl2()
test_valley()
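
Similarly, a standalone single-image equivalent of test_valley here, again assuming InferRequest and its images field from ms-swift's inference API:

from swift.llm import PtEngine, RequestConfig, InferRequest  # illustrative sketch, not part of this diff

engine = PtEngine('bytedance-research/Valley-Eagle-7B')
request = InferRequest(
    messages=[{'role': 'user', 'content': '<image>What is in this picture?'}],
    images=['cat.png'])  # placeholder local path or URL
resp = engine.infer([request], RequestConfig(max_tokens=64, temperature=0))
print(resp[0].choices[0].message.content)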
