version(v1.3.2): release
DoodleBears committed Jul 7, 2024
1 parent 93b2f1b commit c97e5f7
Showing 3 changed files with 129 additions and 40 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@ def read(*relpath):

setup(
name="split_lang",
version="1.3.1",
version="1.3.2",
description="A package for splitting text by languages through concatenating over split substrings based on their language",
long_description=read("README.md"),
long_description_content_type="text/markdown",
165 changes: 127 additions & 38 deletions split-lang-benchmark.ipynb
@@ -4,47 +4,32 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Language Detection Package"
"## Import Language Detection Package"
]
},
{
"cell_type": "code",
"execution_count": 272,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import polyglot\n",
"import langdetect\n",
"import fast_langdetect\n",
"from lingua import Language, LanguageDetectorBuilder\n",
"\n",
"detector = LanguageDetectorBuilder.from_all_languages().build()"
"from lingua import Language, LanguageDetectorBuilder"
]
},
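For context: later cells call a `fast_lang_detect` helper whose definition sits in a collapsed hunk. Judging from the imports above, it most likely wraps `fast_langdetect.detect`; the following is a minimal sketch under that assumption (the helper name is the notebook's, the body is guessed):

```python
# Sketch of the collapsed fast_lang_detect helper (assumption: it wraps
# fast_langdetect.detect, which returns a dict like {"lang": "en", "score": ...}).
from fast_langdetect import detect

def fast_lang_detect(text: str) -> str:
    # fast_langdetect is fasttext-based and expects single-line input
    return detect(text.replace("\n", " "))["lang"]

print(fast_lang_detect("Vielen Dank"))  # expected: "de"
```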
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Text Split Package"
"## Import Text Split Package"
]
},
{
"cell_type": "code",
"execution_count": 273,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\wtpsplit\\__init__.py:45: DeprecationWarning: You are using WtP, the old sentence segmentation model. It is highly encouraged to use SaT instead due to strongly improved performance and efficiency. See https://github.com/segment-any-text/wtpsplit for more info. To ignore this warning, set ignore_legacy_warning=True.\n",
" warnings.warn(\n",
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\sklearn\\base.py:376: InconsistentVersionWarning: Trying to unpickle estimator LogisticRegression from version 1.2.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
"https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
" warnings.warn(\n"
]
}
],
"outputs": [],
"source": [
"from wtpsplit import SaT, WtP\n",
"sat = SaT(\"sat-1l-sm\")\n",
@@ -55,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 274,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -64,14 +49,14 @@
},
{
"cell_type": "code",
"execution_count": 275,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 110.31it/s]\n"
"100%|██████████| 1/1 [00:00<00:00, 125.02it/s]\n"
]
},
{
@@ -80,7 +65,7 @@
"['你', '喜欢看', 'アニメ', '吗']"
]
},
"execution_count": 275,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -91,7 +76,7 @@
},
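The cell source that produced `['你', '喜欢看', 'アニメ', '吗']` is collapsed above. Presumably it runs the SaT model loaded earlier (or WtP) with a very low threshold, so it cuts inside the sentence at language boundaries; a sketch under that assumption (the exact threshold is a guess):

```python
# Sketch only: the real call is hidden by the collapsed hunk.
# sat is the SaT("sat-1l-sm") model loaded earlier; a low threshold
# forces sub-sentence cuts rather than sentence-level splits.
segments = sat.split("你喜欢看アニメ吗", threshold=1e-4)
print(segments)  # observed output above: ['你', '喜欢看', 'アニメ', '吗']
```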
{
"cell_type": "code",
"execution_count": 290,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -176,9 +161,16 @@
"texts = texts_zh_jp_ko_en + texts_de_fr_en + texts_with_digit"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple Rule-Based judge"
]
},
{
"cell_type": "code",
"execution_count": 291,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -211,15 +203,42 @@
" return False"
]
},
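Most of the judge is collapsed above; from the heading and the trailing `return False` it reads as a boolean predicate over scripts. A common way to write such a rule is to test Unicode code-point ranges. The sketch below is illustrative, not the notebook's actual code (the function name and the choice of ranges are assumptions):

```python
# Hypothetical rule-based judge: kana is unique to Japanese, while kanji
# overlaps with Chinese, so "contains kana" resolves zh/ja-ambiguous text.
def contains_kana(text: str) -> bool:
    for ch in text:
        code = ord(ch)
        if 0x3040 <= code <= 0x309F:  # Hiragana block
            return True
        if 0x30A0 <= code <= 0x30FF:  # Katakana block
            return True
    return False

print(contains_kana("アニメ"))  # True
print(contains_kana("你好"))    # False
```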
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Try different split logic"
]
},
{
"cell_type": "code",
"execution_count": 292,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"jp_budoux_parser----------\n",
"['你好', '今日は', 'どこへ', '行きますか']\n",
"['我的名字是田中さんです']\n",
"['我喜欢吃寿司和拉面', 'おいしいです']\n",
"['今天の', '天気は', 'とても', 'いいですね']\n",
"['我在学习日本語少し', '難しいです']\n",
"['日语真是おもしろい', '啊']\n",
"['你喜欢看アニメ吗']\n",
"['我想去日本旅行特に', '京都に', '行きたいです']\n",
"['昨天見た', '映画は', 'とても', '感動的でした']\n",
"['我朋友是日本人', '彼は', 'とても', '優しいです']\n",
"['我们一起去カラオケ吧']\n",
"['我的家在北京でも', '仕事で', '東京に', '住んでいます']\n",
"['我的名字是西野くまです']\n",
"['我的名字是西野くまですよろしく', 'お願い', 'いたします']\n",
"['好吃美味しい', '上手い']\n",
"['我给你送的手紙']\n",
"['真是面白い']\n",
"['春の', '花香り']\n",
"['何ヶ国語話せますか']\n",
"----------jp_budoux_parser\n",
"[['你好', '今日は', 'どこへ', '行きますか'], ['我的名字是田中さんです'], ['我喜欢吃寿司和拉面', 'おいしいです'], ['今天の', '天気は', 'とても', 'いいですね'], ['我在学习日本語少し', '難しいです'], ['日语真是おもしろい', '啊'], ['你喜欢看アニメ吗'], ['我想去日本旅行特に', '京都に', '行きたいです'], ['昨天見た', '映画は', 'とても', '感動的でした'], ['我朋友是日本人', '彼は', 'とても', '優しいです'], ['我们一起去カラオケ吧'], ['我的家在北京でも', '仕事で', '東京に', '住んでいます'], ['我的名字是西野くまです'], ['我的名字是西野くまですよろしく', 'お願い', 'いたします'], ['好吃美味しい', '上手い'], ['我给你送的手紙'], ['真是面白い'], ['春の', '花香り'], ['何ヶ国語話せますか']]\n",
"----------jp_budoux_parser+zh_budoux_parser\n",
@@ -283,15 +302,11 @@
"# for text in texts_zh_jp:\n",
"# print(zh_tc_budoux_parser.parse(text))\n",
" \n",
"# print(\"jp_budoux_parser----------\")\n",
"# for text in texts_zh_jp:\n",
"# print(jp_budoux_parser.parse(text))\n",
"\n",
"print(\"jp_budoux_parser----------\")\n",
"for text in texts_zh_jp:\n",
" print(jp_budoux_parser.parse(text))\n",
"\n",
"\n",
"# print(\"----------wtp\")\n",
"# for text in texts_zh_jp:\n",
"# print(wtp.split(text_or_texts=text, threshold=5e-4, verbose=False))\n",
"print(\"----------jp_budoux_parser\")\n",
"\n",
"splitted_texts_jp = []\n",
@@ -341,17 +356,21 @@
},
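For readers unfamiliar with BudouX: it ships small pretrained phrase-segmentation models for Japanese and Chinese, and the `jp_budoux_parser` / `zh_budoux_parser` objects used above are presumably built with its standard loaders. A self-contained example:

```python
# Standard BudouX loaders; parse() returns a list of phrase strings.
import budoux

jp_budoux_parser = budoux.load_default_japanese_parser()
zh_budoux_parser = budoux.load_default_simplified_chinese_parser()

print(jp_budoux_parser.parse("今日はどこへ行きますか"))
# ['今日は', 'どこへ', '行きますか'], matching the output above
print(zh_budoux_parser.parse("我喜欢吃寿司和拉面"))
```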
{
"cell_type": "code",
"execution_count": 293,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from lingua import Language, LanguageDetectorBuilder\n",
"all_detector = LanguageDetectorBuilder.from_all_languages().build()"
"all_detector = (\n",
" LanguageDetectorBuilder.from_all_languages()\n",
" .with_preloaded_language_models()\n",
" .build()\n",
")"
]
},
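The change above adds `with_preloaded_language_models()`, which loads every language model eagerly at build time instead of lazily on first use, trading startup time and memory for steadier per-call latency in a benchmark. Basic usage of the resulting detector:

```python
# detect_language_of returns a Language enum member, or None if undecided.
from lingua import LanguageDetectorBuilder

detector = (
    LanguageDetectorBuilder.from_all_languages()
    .with_preloaded_language_models()
    .build()
)
lang = detector.detect_language_of("今日はどこへ行きますか")
print(lang)                              # Language.JAPANESE
print(lang.iso_code_639_1.name.lower())  # "ja"
```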
{
"cell_type": "code",
"execution_count": 294,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -414,6 +433,76 @@
" print(f\"{lang}:{substring}\",end='|')\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Difference between split by ` ` (space) and by `wtpsplit`"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\wtpsplit\\__init__.py:45: DeprecationWarning: You are using WtP, the old sentence segmentation model. It is highly encouraged to use SaT instead due to strongly improved performance and efficiency. See https://github.com/segment-any-text/wtpsplit for more info. To ignore this warning, set ignore_legacy_warning=True.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Vielen ', 'Dank ', 'merci ', 'beaucoup ', 'for your help.']\n",
"de:Vielen |nl:Dank |fr:merci |fr:beaucoup |en:for your help.|\n",
"0.5394351482391357\n",
"de:Vielen|nl:Dank|fr:merci|fr:beaucoup|en:for|en:your|en:help.|\n",
"0.0009989738464355469\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\sklearn\\base.py:376: InconsistentVersionWarning: Trying to unpickle estimator LogisticRegression from version 1.2.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
"https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
" warnings.warn(\n"
]
}
],
"source": [
"from wtpsplit import WtP\n",
"from time import time\n",
"from datetime import datetime\n",
"text = \"Vielen Dank merci beaucoup for your help.\"\n",
"\n",
"time1 = datetime.now().timestamp()\n",
"wtp = WtP('wtp-bert-mini')\n",
"substrings = wtp.split(text_or_texts=text, threshold=1e-4)\n",
"print(substrings)\n",
"for substring in substrings:\n",
" # lang = lingua_lang_detect_all(substring)\n",
" lang = fast_lang_detect(substring)\n",
" print(f\"{lang}:{substring}\",end='|')\n",
"print()\n",
"time2 = datetime.now().timestamp()\n",
"\n",
"print(time2 - time1)\n",
"\n",
"substrings = text.split(' ')\n",
"for substring in substrings:\n",
" # lang = lingua_lang_detect_all(substring)\n",
" lang = fast_lang_detect(substring)\n",
" print(f\"{lang}:{substring}\",end='|')\n",
"time3 = datetime.now().timestamp()\n",
"print()\n",
"print(time3 - time2)"
]
}
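One caveat on the numbers above: the 0.54 s measured for the WtP path includes the one-time `WtP('wtp-bert-mini')` model load, while the space-split path times only splitting plus detection, so the comparison is not like-for-like. A fairer sketch separates load from inference and uses `time.perf_counter()`, which suits short intervals better than `datetime.now().timestamp()`:

```python
# Sketch: separate the one-time model load from the per-call split cost.
from time import perf_counter
from wtpsplit import WtP

text = "Vielen Dank merci beaucoup for your help."

t0 = perf_counter()
wtp = WtP("wtp-bert-mini")                                  # one-time cost
t1 = perf_counter()
substrings = wtp.split(text_or_texts=text, threshold=1e-4)  # per-call cost
t2 = perf_counter()

print(f"model load: {t1 - t0:.3f}s, split: {t2 - t1:.3f}s")
```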
],
"metadata": {
2 changes: 1 addition & 1 deletion split-lang-demo.ipynb
@@ -7,7 +7,7 @@
"outputs": [],
"source": [
"%%capture\n",
"%pip install split-lang==1.3.1"
"%pip install split-lang==1.3.2"
]
},
{
