(feat): Novelai tokenizer re-implement || New LLM #75

Merged 13 commits on Sep 26, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -177,3 +177,4 @@ cython_debug/
 /playground/art_assert/
 /playground/unpack/
 /playground/boom-train/
+/frontend/
26 changes: 24 additions & 2 deletions README.md
@@ -16,6 +16,7 @@ The goal of this repository is to use Pydantic to build legitimate requests to a
 - [x] tool.random_prompt
 - [x] tool.paint_mask
 - [x] tool.image_metadata
+- [x] tokenizer
 - [x] /ai/generate-image
 - [x] /user/subscription
 - [x] /user/login
@@ -85,7 +86,7 @@ from dotenv import load_dotenv
 from pydantic import SecretStr
 
 from novelai_python import APIError, LoginCredential
-from novelai_python.sdk.ai.generate import TextLLMModel, LLM
+from novelai_python.sdk.ai.generate import TextLLMModel, LLM, get_default_preset
 
 load_dotenv()
 username = os.getenv("NOVELAI_USER", None)
@@ -99,7 +100,13 @@ login_credential = LoginCredential(
 
 async def chat(prompt: str):
     try:
-        agent = LLM.build(prompt=prompt, model=TextLLMModel.Kayra)
+        model = TextLLMModel.ERATO  # Llama 3 based
+        parameters = get_default_preset(model).parameters
+        agent = LLM.build(
+            prompt=prompt,
+            model=model,
+            parameters=parameters,  # or pass None to auto-select the model's default preset
+        )
         result = await agent.request(session=login_credential)
     except APIError as e:
         raise Exception(f"Error: {e.message}")
@@ -126,6 +133,21 @@ pip install novelai_python
 python3 -m novelai_python.server -h '127.0.0.1' -p 7888
 ```
+
+#### Tokenizer
+
+```python
+from novelai_python._enum import get_tokenizer_model, TextLLMModel
+from novelai_python.tokenizer import NaiTokenizer
+
+tokenizer_package = NaiTokenizer(get_tokenizer_model(TextLLMModel.ERATO))
+t_text = "a fox jumped over the lazy dog"
+encode_tokens = tokenizer_package.encode(t_text)
+print(tokenizer_package.tokenize_text(t_text))  # token strings
+print(f"Encoded token IDs: {encode_tokens}")
+print(tokenizer_package.decode(encode_tokens))  # round-trip back to text
+```
 
 ## Acknowledgements 🙏
 
 [BackEnd](https://api.novelai.net/docs)
176 changes: 108 additions & 68 deletions pdm.lock

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions playground/generate.py
@@ -27,18 +27,20 @@ async def chat(prompt="Hello"):
     try:
         agent = LLM.build(
             prompt=prompt,
-            model=TextLLMModel.Kayra,
+            model=TextLLMModel.ERATO,
         )
-        result = await agent.request(session=credential)
+        result = await agent.request(session=_login_credential)
     except APIError as e:
         logger.exception(e)
         print(f"Error: {e.message}")
         return None
     except Exception as e:
         logger.exception(e)
     else:
-        print(f"Result: \n{result.text}")
+        print(f"Result:\n{result.text}")
 
 
-loop = asyncio.get_event_loop()
-loop.run_until_complete(chat())
+loop = asyncio.new_event_loop()
+loop.run_until_complete(chat(
+    prompt="a fox jumped over the lazy dog, and the dog barked at the fox. The fox ran away."
+))
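A note on the entry point: `asyncio.get_event_loop()` has been deprecated as an implicit loop factory since Python 3.10, which is presumably why the playground switches to `asyncio.new_event_loop()`. A minimal sketch of the now-idiomatic alternative, reusing the `chat` coroutine above:

```python
import asyncio

# asyncio.run() creates a fresh event loop, runs the coroutine to
# completion, and closes the loop afterwards, with no manual handling.
asyncio.run(chat(prompt="a fox jumped over the lazy dog"))
```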
6 changes: 2 additions & 4 deletions playground/generate_stream.py
@@ -34,14 +34,12 @@ async def stream(prompt="Hello"):
     try:
         agent = LLMStream.build(
             prompt=prompt,
-            model=TextLLMModel.Kayra,
+            model=TextLLMModel.ERATO,
         )
-        _data = []
-        # Now you can use an async for loop to process each chunk of data
-        generator = agent.request(session=credential)
+        generator = agent.request(session=_login_credential)
         async for data in generator:
             data: LLMStreamResp
             print(data.text)  # or do whatever processing you need
             _data.append(data)
     except APIError as e:
         print(f"Error: {e.message}")
94 changes: 94 additions & 0 deletions playground/tokenizer/tokenizer_demo.py
@@ -0,0 +1,94 @@
import json
import os
import pathlib
import zlib
from typing import Dict, List, Optional

import requests
from json_repair import repair_json
from pydantic import BaseModel, model_validator
from tokenizers import Tokenizer, pre_tokenizers, Regex, decoders
from tokenizers.models import BPE

# https://novelai.net/tokenizer/compressed/llama3nai_tokenizer.def?v=2&static=true

model_name = "clip_tokenizer"
model_full_name = f"{model_name}.def"
url = f"https://novelai.net/tokenizer/compressed/{model_full_name}?v=2&static=true"
if not os.path.exists(model_full_name):
    print(f"Downloading {model_full_name} from {url}")
    response = requests.get(url)
    response.raise_for_status()
    # Write the payload to disk
    with open(model_full_name, "wb") as f:
        f.write(response.content)


class TokenizerSetting(BaseModel):
    class TokenizerConfig(BaseModel):
        splitRegex: str
        maxEncodeChars: Optional[int] = None
        maxNoWhitespaceChars: Optional[int] = None
        ignoreMerges: Optional[bool] = False

    config: TokenizerConfig
    specialTokens: List[str]
    vocab: Dict[str, int]
    merges: list

    @model_validator(mode="after")
    def ensure(self):
        # BPE expects merges as tuples, not lists
        self.merges = [tuple(merge) for merge in self.merges]
        return self


# Read and decompress the definition file
file = pathlib.Path(__file__).parent.joinpath(model_full_name)
encoded_data = file.read_bytes()
decompress_obj = zlib.decompressobj(-zlib.MAX_WBITS)  # raw DEFLATE stream, no zlib header
decode = decompress_obj.decompress(encoded_data)

# Repair and parse the JSON
repaired_json = repair_json(decode.decode('utf-8'), return_objects=True)
with open(f"{model_name}.json", "w") as dump_file:
    json.dump(repaired_json, dump_file, indent=2)
tokenizer_setting = TokenizerSetting.model_validate(repaired_json)

# Build the tokenizer
tokenizer = Tokenizer(BPE(
    vocab=tokenizer_setting.vocab,
    merges=tokenizer_setting.merges,
    ignore_merges=tokenizer_setting.config.ignoreMerges
))

# Register special tokens
tokenizer.add_special_tokens(tokenizer_setting.specialTokens)
print(tokenizer.token_to_id(" "))  # check whether a literal space exists in the vocab
if tokenizer_setting.config.maxEncodeChars:
    tokenizer.enable_truncation(max_length=tokenizer_setting.config.maxEncodeChars)
# Set the normalizer
# tokenizer.normalizer = normalizers.Sequence([])

# Set the pre-tokenizer
pre_zus = [
    pre_tokenizers.Split(
        behavior="merged_with_next",
        pattern=Regex(tokenizer_setting.config.splitRegex)
    ),
]
if tokenizer.token_to_id(" ") is None:
    # No literal space token: fall back to byte-level pre-tokenization
    pre_zus.append(pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False))
pre_tokenizer = pre_tokenizers.Sequence(pre_zus)

tokenizer.pre_tokenizer = pre_tokenizer
tokenizer.decoder = decoders.ByteLevel()

# Exercise the tokenizer
text = "Hello, World! This is a test."
encoded = tokenizer.encode(text, add_special_tokens=True)
print(f"Pre-tokenized text: {pre_tokenizer.pre_tokenize_str(text)}")
print(f"Encoded tokens: {encoded.tokens}")
print(f"Token IDs: {encoded.ids}")

# Decode back to text
decoded = tokenizer.decode(encoded.ids)
print(f"Decoded text: {decoded}")
22 changes: 14 additions & 8 deletions playground/tokenizer/usage.py
@@ -1,16 +1,22 @@
-from novelai_python.tokenizer import ImagePromptTokenizer
+from novelai_python._enum import TextTokenizerGroup, get_tokenizer_model, TextLLMModel
+from novelai_python.tokenizer import NaiTokenizer
 from novelai_python.utils.encode import b64_to_tokens
 
-tokenizer_util = ImagePromptTokenizer(ImagePromptTokenizer.MODEL_V2_PATH)
-text = "The quick brown fox jumps over the goblin."
-token_id = tokenizer_util.encode(text)
-print("Token IDs:", token_id)
-decoded_text = tokenizer_util.decode(token_id)
-print("Decoded text:", decoded_text)
+tokenizer_package = NaiTokenizer(get_tokenizer_model(TextLLMModel.ERATO))
+t_text = "a fox jumped over the lazy dog"
+encode_tokens = tokenizer_package.encode(t_text)
+print(tokenizer_package.tokenize_text(t_text))
+print(f"Encoded token IDs: {encode_tokens}")
+print(tokenizer_package.decode(encode_tokens))
 
 b64 = "UfQBADoAAABIAQAAGQAAANwAAAATAAAAexQAAEAAAAD/mwAA2GkAAJ8DAAAXAQAAtT4AAC8WAAA="
 oks = b64_to_tokens(b64)
 print(oks)
 
 
 def limit_prompt_shown(raw_text: str, token_limit=225):
     assert isinstance(raw_text, str), "raw_text must be a string"
-    tokenizer = ImagePromptTokenizer(ImagePromptTokenizer.MODEL_V2_PATH)
+    tokenizer = NaiTokenizer(TextTokenizerGroup.NERDSTASH_V2)
     token_array = tokenizer.encode(raw_text)
     used_tokens_len = len(token_array)
     if used_tokens_len > token_limit:
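The diff folds away the rest of `limit_prompt_shown`. For context, a hypothetical completion of such a truncation helper, assuming `NaiTokenizer.decode` accepts the ID list returned by `encode` (the actual body past the fold is not shown, so this is illustrative only):

```python
def limit_prompt_shown_sketch(raw_text: str, token_limit: int = 225) -> str:
    # Illustrative sketch, not the PR's actual implementation.
    tokenizer = NaiTokenizer(TextTokenizerGroup.NERDSTASH_V2)
    token_array = tokenizer.encode(raw_text)
    if len(token_array) <= token_limit:
        return raw_text
    # Keep the first token_limit tokens and decode them back to text.
    return tokenizer.decode(token_array[:token_limit])
```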
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "novelai-python"
-version = "0.4.17"
+version = "0.5.0"
 description = "NovelAI Python Binding With Pydantic"
 authors = [
     { name = "sudoskys", email = "[email protected]" },
@@ -26,6 +26,8 @@ dependencies = [
     "ftfy>=6.2.0",
     "regex>=2023.12.25",
     "tokenizers>=0.15.2",
+    "json-repair>=0.29.4",
+    "robust-downloader>=0.0.2",
 ]
 requires-python = ">=3.9"
 readme = "README.md"
1 change: 0 additions & 1 deletion src/novelai_python/__init__.py
@@ -2,7 +2,6 @@
 # @Time   : 2023/11/18 12:18 AM
 # @Author : sudoskys
 # @File   : __init__.py
-
 from ._exceptions import (
     NovelAiError,
     APIError,
94 changes: 94 additions & 0 deletions src/novelai_python/_enum.py
@@ -0,0 +1,94 @@
from enum import Enum
from typing import Optional, Union


class TextLLMModel(Enum):
    NEO_2B = "2.7B"
    J_6B = "6B"
    J_6B_V3 = "6B-v3"
    J_6B_V4 = "6B-v4"
    GENJI_PYTHON_6B = "genji-python-6b"
    GENJI_JP_6B = "genji-jp-6b"
    GENJI_JP_6B_V2 = "genji-jp-6b-v2"
    EUTERPE_V0 = "euterpe-v0"
    EUTERPE_V2 = "euterpe-v2"
    KRAKE_V1 = "krake-v1"
    KRAKE_V2 = "krake-v2"
    BLUE = "blue"
    RED = "red"
    GREEN = "green"
    PURPLE = "purple"
    PINK = "pink"
    YELLOW = "yellow"
    WHITE = "white"
    BLACK = "black"
    CASSANDRA = "cassandra"
    COMMENT_BOT = "hypebot"
    INFILL = "infillmodel"
    CLIO = "clio-v1"
    KAYRA = "kayra-v1"
    ERATO = "llama-3-erato-v1"


class TextTokenizerGroup(object):
    GENJI = "genji_tokenizer.def"
    PILE = "pile_tokenizer.def"
    PILE_NAI = "pile_tokenizer.def"
    NAI_INLINE = "gpt2_tokenizer.def"
    NERDSTASH_V2 = "nerdstash_tokenizer_v2.def"
    NERDSTASH = "nerdstash_tokenizer.def"
    LLAMA3 = "llama3_tokenizer.def"
    GPT2 = "gpt2_tokenizer.def"
    CLIP = "clip_tokenizer.def"


TextLLMModelTypeAlias = Union[TextLLMModel, str]

TOKENIZER_MODEL_MAP = {
    TextLLMModel.GENJI_JP_6B_V2: TextTokenizerGroup.GENJI,
    TextLLMModel.CASSANDRA: TextTokenizerGroup.PILE,
    TextLLMModel.KRAKE_V2: TextTokenizerGroup.PILE,
    TextLLMModel.INFILL: TextTokenizerGroup.NAI_INLINE,
    TextLLMModel.KAYRA: TextTokenizerGroup.NERDSTASH_V2,
    TextLLMModel.BLUE: TextTokenizerGroup.NERDSTASH_V2,
    TextLLMModel.PINK: TextTokenizerGroup.NERDSTASH_V2,
    TextLLMModel.YELLOW: TextTokenizerGroup.NERDSTASH_V2,
    TextLLMModel.RED: TextTokenizerGroup.NERDSTASH_V2,
    TextLLMModel.GREEN: TextTokenizerGroup.NERDSTASH_V2,
    TextLLMModel.BLACK: TextTokenizerGroup.NERDSTASH_V2,
    TextLLMModel.CLIO: TextTokenizerGroup.NERDSTASH,
    TextLLMModel.PURPLE: TextTokenizerGroup.LLAMA3,
    TextLLMModel.WHITE: TextTokenizerGroup.LLAMA3,
    TextLLMModel.ERATO: TextTokenizerGroup.LLAMA3,
}

COLORS_LLM = [
    TextLLMModel.BLUE,
    TextLLMModel.RED,
    TextLLMModel.GREEN,
    TextLLMModel.PURPLE,
    TextLLMModel.PINK,
    TextLLMModel.YELLOW,
    TextLLMModel.WHITE,
    TextLLMModel.BLACK,
]


def get_llm_group(model: TextLLMModel) -> Optional[TextTokenizerGroup]:
    if isinstance(model, str):
        model = TextLLMModel(model)
    return TOKENIZER_MODEL_MAP.get(model, None)


def get_tokenizer_model(model: TextLLMModel) -> str:
    if isinstance(model, str):
        model = TextLLMModel(model)
    group = TOKENIZER_MODEL_MAP.get(model, TextTokenizerGroup.GPT2)
    return group


def get_tokenizer_model_url(model: TextLLMModel) -> str:
    model_name = get_tokenizer_model(model)
    if not model_name.endswith(".def"):
        model_name = f"{model_name}.def"
    return f"https://novelai.net/tokenizer/compressed/{model_name}?v=2&static=true"
9 changes: 6 additions & 3 deletions src/novelai_python/_response/ai/generate.py
Expand Up @@ -2,8 +2,8 @@

from pydantic import BaseModel, ConfigDict

from novelai_python.sdk.ai.generate._enum import TOKENIZER, TextLLMModel # noqa
from novelai_python.tokenizer import LLMTokenizer
from novelai_python._enum import get_tokenizer_model, TextLLMModel
from novelai_python.tokenizer import NaiTokenizer
from novelai_python.utils.encode import b64_to_tokens

if TYPE_CHECKING:
@@ -20,4 +20,7 @@ class LLMResp(BaseModel):
 
     @staticmethod
     def decode_token(token_str, model: TextLLMModel) -> str:
-        return LLMTokenizer().decode(b64_to_tokens(token_str), tokenizer_name=TOKENIZER.get(model))
+        dtype = 'uint32' if model in [TextLLMModel.ERATO] else 'uint16'
+        return NaiTokenizer(model=get_tokenizer_model(model)).decode(
+            b64_to_tokens(token_str, dtype=dtype)
+        )
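The `dtype` switch exists because Erato is Llama 3 based: the Llama 3 vocabulary (roughly 128k entries) overflows 16-bit IDs, so its base64 token stream must be unpacked as uint32 instead of uint16. A hypothetical sketch of that unpacking, assuming `b64_to_tokens` is a thin base64-plus-fixed-width-integer decoder (illustrative, not the library's actual code):

```python
import base64
import struct
from typing import List


def b64_to_token_ids(token_str: str, dtype: str = "uint16") -> List[int]:
    # Hypothetical re-implementation for illustration only.
    raw = base64.b64decode(token_str)
    # Token IDs are packed as little-endian fixed-width integers.
    fmt, width = ("<I", 4) if dtype == "uint32" else ("<H", 2)
    return [struct.unpack_from(fmt, raw, i)[0] for i in range(0, len(raw), width)]
```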
9 changes: 6 additions & 3 deletions src/novelai_python/_response/ai/generate_stream.py
Expand Up @@ -2,8 +2,8 @@

from pydantic import BaseModel, ConfigDict

from novelai_python.sdk.ai.generate._enum import TOKENIZER, TextLLMModel # noqa
from novelai_python.tokenizer import LLMTokenizer
from novelai_python._enum import get_tokenizer_model, TextLLMModel
from novelai_python.tokenizer import NaiTokenizer
from novelai_python.utils.encode import b64_to_tokens


@@ -20,4 +20,7 @@ class LLMStreamResp(BaseModel):
 
     @staticmethod
     def decode(token_str, model: TextLLMModel) -> str:
-        return LLMTokenizer().decode(b64_to_tokens(token_str), tokenizer_name=TOKENIZER.get(model))
+        dtype = 'uint32' if model in [TextLLMModel.ERATO] else 'uint16'
+        return NaiTokenizer(model=get_tokenizer_model(model)).decode(
+            b64_to_tokens(token_str, dtype=dtype)
+        )