version(v1.3.2): release
DoodleBears committed Jul 7, 2024
1 parent 93b2f1b commit c97e5f7
Showing 3 changed files with 129 additions and 40 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@ def read(*relpath):

setup(
name="split_lang",
version="1.3.1",
version="1.3.2",
description="A package for splitting text by languages through concatenating over split substrings based on their language",
long_description=read("README.md"),
long_description_content_type="text/markdown",
165 changes: 127 additions & 38 deletions split-lang-benchmark.ipynb
@@ -4,47 +4,32 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Language Detection Package"
"## Import Language Detection Package"
]
},
{
"cell_type": "code",
"execution_count": 272,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import polyglot\n",
"import langdetect\n",
"import fast_langdetect\n",
"from lingua import Language, LanguageDetectorBuilder\n",
"\n",
"detector = LanguageDetectorBuilder.from_all_languages().build()"
"from lingua import Language, LanguageDetectorBuilder"
]
},
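For context: later cells call a `fast_lang_detect` helper whose definition sits in a collapsed hunk. Judging from the imports above, it most likely wraps `fast_langdetect.detect`; the following is a minimal sketch under that assumption (the helper name is the notebook's, the body is guessed):

```python
# Sketch of the collapsed fast_lang_detect helper (assumption: it wraps
# fast_langdetect.detect, which returns a dict like {"lang": "en", "score": ...}).
from fast_langdetect import detect

def fast_lang_detect(text: str) -> str:
    # fast_langdetect is fasttext-based and expects single-line input
    return detect(text.replace("\n", " "))["lang"]

print(fast_lang_detect("Vielen Dank"))  # expected: "de"
```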
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Text Split Package"
"## Import Text Split Package"
]
},
{
"cell_type": "code",
"execution_count": 273,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\wtpsplit\\__init__.py:45: DeprecationWarning: You are using WtP, the old sentence segmentation model. It is highly encouraged to use SaT instead due to strongly improved performance and efficiency. See https://github.com/segment-any-text/wtpsplit for more info. To ignore this warning, set ignore_legacy_warning=True.\n",
" warnings.warn(\n",
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\sklearn\\base.py:376: InconsistentVersionWarning: Trying to unpickle estimator LogisticRegression from version 1.2.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
"https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
" warnings.warn(\n"
]
}
],
"outputs": [],
"source": [
"from wtpsplit import SaT, WtP\n",
"sat = SaT(\"sat-1l-sm\")\n",
@@ -55,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 274,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -64,14 +49,14 @@
},
{
"cell_type": "code",
"execution_count": 275,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 110.31it/s]\n"
"100%|██████████| 1/1 [00:00<00:00, 125.02it/s]\n"
]
},
{
@@ -80,7 +65,7 @@
"['你', '喜欢看', 'アニメ', '吗']"
]
},
"execution_count": 275,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -91,7 +76,7 @@
},
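The cell source that produced `['你', '喜欢看', 'アニメ', '吗']` is collapsed above. Presumably it runs the SaT model loaded earlier (or WtP) with a very low threshold, so it cuts inside the sentence at language boundaries; a sketch under that assumption (the exact threshold is a guess):

```python
# Sketch only: the real call is hidden by the collapsed hunk.
# sat is the SaT("sat-1l-sm") model loaded earlier; a low threshold
# forces sub-sentence cuts rather than sentence-level splits.
segments = sat.split("你喜欢看アニメ吗", threshold=1e-4)
print(segments)  # observed output above: ['你', '喜欢看', 'アニメ', '吗']
```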
{
"cell_type": "code",
"execution_count": 290,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -176,9 +161,16 @@
"texts = texts_zh_jp_ko_en + texts_de_fr_en + texts_with_digit"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple Rule-Based judge"
]
},
{
"cell_type": "code",
"execution_count": 291,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -211,15 +203,42 @@
" return False"
]
},
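Most of the judge is collapsed above; from the heading and the trailing `return False` it reads as a boolean predicate over scripts. A common way to write such a rule is to test Unicode code-point ranges. The sketch below is illustrative, not the notebook's actual code (the function name and the choice of ranges are assumptions):

```python
# Hypothetical rule-based judge: kana is unique to Japanese, while kanji
# overlaps with Chinese, so "contains kana" resolves zh/ja-ambiguous text.
def contains_kana(text: str) -> bool:
    for ch in text:
        code = ord(ch)
        if 0x3040 <= code <= 0x309F:  # Hiragana block
            return True
        if 0x30A0 <= code <= 0x30FF:  # Katakana block
            return True
    return False

print(contains_kana("アニメ"))  # True
print(contains_kana("你好"))    # False
```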
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Try different split logic"
]
},
{
"cell_type": "code",
"execution_count": 292,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"jp_budoux_parser----------\n",
"['你好', '今日は', 'どこへ', '行きますか']\n",
"['我的名字是田中さんです']\n",
"['我喜欢吃寿司和拉面', 'おいしいです']\n",
"['今天の', '天気は', 'とても', 'いいですね']\n",
"['我在学习日本語少し', '難しいです']\n",
"['日语真是おもしろい', '啊']\n",
"['你喜欢看アニメ吗']\n",
"['我想去日本旅行特に', '京都に', '行きたいです']\n",
"['昨天見た', '映画は', 'とても', '感動的でした']\n",
"['我朋友是日本人', '彼は', 'とても', '優しいです']\n",
"['我们一起去カラオケ吧']\n",
"['我的家在北京でも', '仕事で', '東京に', '住んでいます']\n",
"['我的名字是西野くまです']\n",
"['我的名字是西野くまですよろしく', 'お願い', 'いたします']\n",
"['好吃美味しい', '上手い']\n",
"['我给你送的手紙']\n",
"['真是面白い']\n",
"['春の', '花香り']\n",
"['何ヶ国語話せますか']\n",
"----------jp_budoux_parser\n",
"[['你好', '今日は', 'どこへ', '行きますか'], ['我的名字是田中さんです'], ['我喜欢吃寿司和拉面', 'おいしいです'], ['今天の', '天気は', 'とても', 'いいですね'], ['我在学习日本語少し', '難しいです'], ['日语真是おもしろい', '啊'], ['你喜欢看アニメ吗'], ['我想去日本旅行特に', '京都に', '行きたいです'], ['昨天見た', '映画は', 'とても', '感動的でした'], ['我朋友是日本人', '彼は', 'とても', '優しいです'], ['我们一起去カラオケ吧'], ['我的家在北京でも', '仕事で', '東京に', '住んでいます'], ['我的名字是西野くまです'], ['我的名字是西野くまですよろしく', 'お願い', 'いたします'], ['好吃美味しい', '上手い'], ['我给你送的手紙'], ['真是面白い'], ['春の', '花香り'], ['何ヶ国語話せますか']]\n",
"----------jp_budoux_parser+zh_budoux_parser\n",
@@ -283,15 +302,11 @@
"# for text in texts_zh_jp:\n",
"# print(zh_tc_budoux_parser.parse(text))\n",
" \n",
"# print(\"jp_budoux_parser----------\")\n",
"# for text in texts_zh_jp:\n",
"# print(jp_budoux_parser.parse(text))\n",
"\n",
"print(\"jp_budoux_parser----------\")\n",
"for text in texts_zh_jp:\n",
" print(jp_budoux_parser.parse(text))\n",
"\n",
"\n",
"# print(\"----------wtp\")\n",
"# for text in texts_zh_jp:\n",
"# print(wtp.split(text_or_texts=text, threshold=5e-4, verbose=False))\n",
"print(\"----------jp_budoux_parser\")\n",
"\n",
"splitted_texts_jp = []\n",
@@ -341,17 +356,21 @@
},
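For readers unfamiliar with BudouX: it ships small pretrained phrase-segmentation models for Japanese and Chinese, and the `jp_budoux_parser` / `zh_budoux_parser` objects used above are presumably built with its standard loaders. A self-contained example:

```python
# Standard BudouX loaders; parse() returns a list of phrase strings.
import budoux

jp_budoux_parser = budoux.load_default_japanese_parser()
zh_budoux_parser = budoux.load_default_simplified_chinese_parser()

print(jp_budoux_parser.parse("今日はどこへ行きますか"))
# ['今日は', 'どこへ', '行きますか'], matching the output above
print(zh_budoux_parser.parse("我喜欢吃寿司和拉面"))
```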
{
"cell_type": "code",
"execution_count": 293,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from lingua import Language, LanguageDetectorBuilder\n",
"all_detector = LanguageDetectorBuilder.from_all_languages().build()"
"all_detector = (\n",
" LanguageDetectorBuilder.from_all_languages()\n",
" .with_preloaded_language_models()\n",
" .build()\n",
")"
]
},
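The change above adds `with_preloaded_language_models()`, which loads every language model eagerly at build time instead of lazily on first use, trading startup time and memory for steadier per-call latency in a benchmark. Basic usage of the resulting detector:

```python
# detect_language_of returns a Language enum member, or None if undecided.
from lingua import LanguageDetectorBuilder

detector = (
    LanguageDetectorBuilder.from_all_languages()
    .with_preloaded_language_models()
    .build()
)
lang = detector.detect_language_of("今日はどこへ行きますか")
print(lang)                              # Language.JAPANESE
print(lang.iso_code_639_1.name.lower())  # "ja"
```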
{
"cell_type": "code",
"execution_count": 294,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -414,6 +433,76 @@
" print(f\"{lang}:{substring}\",end='|')\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Difference between split by ` ` (space) and by `wtpsplit`"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\wtpsplit\\__init__.py:45: DeprecationWarning: You are using WtP, the old sentence segmentation model. It is highly encouraged to use SaT instead due to strongly improved performance and efficiency. See https://github.com/segment-any-text/wtpsplit for more info. To ignore this warning, set ignore_legacy_warning=True.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Vielen ', 'Dank ', 'merci ', 'beaucoup ', 'for your help.']\n",
"de:Vielen |nl:Dank |fr:merci |fr:beaucoup |en:for your help.|\n",
"0.5394351482391357\n",
"de:Vielen|nl:Dank|fr:merci|fr:beaucoup|en:for|en:your|en:help.|\n",
"0.0009989738464355469\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\sklearn\\base.py:376: InconsistentVersionWarning: Trying to unpickle estimator LogisticRegression from version 1.2.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
"https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
" warnings.warn(\n"
]
}
],
"source": [
"from wtpsplit import WtP\n",
"from time import time\n",
"from datetime import datetime\n",
"text = \"Vielen Dank merci beaucoup for your help.\"\n",
"\n",
"time1 = datetime.now().timestamp()\n",
"wtp = WtP('wtp-bert-mini')\n",
"substrings = wtp.split(text_or_texts=text, threshold=1e-4)\n",
"print(substrings)\n",
"for substring in substrings:\n",
" # lang = lingua_lang_detect_all(substring)\n",
" lang = fast_lang_detect(substring)\n",
" print(f\"{lang}:{substring}\",end='|')\n",
"print()\n",
"time2 = datetime.now().timestamp()\n",
"\n",
"print(time2 - time1)\n",
"\n",
"substrings = text.split(' ')\n",
"for substring in substrings:\n",
" # lang = lingua_lang_detect_all(substring)\n",
" lang = fast_lang_detect(substring)\n",
" print(f\"{lang}:{substring}\",end='|')\n",
"time3 = datetime.now().timestamp()\n",
"print()\n",
"print(time3 - time2)"
]
}
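One caveat on the numbers above: the 0.54 s measured for the WtP path includes the one-time `WtP('wtp-bert-mini')` model load, while the space-split path times only splitting plus detection, so the comparison is not like-for-like. A fairer sketch separates load from inference and uses `time.perf_counter()`, which suits short intervals better than `datetime.now().timestamp()`:

```python
# Sketch: separate the one-time model load from the per-call split cost.
from time import perf_counter
from wtpsplit import WtP

text = "Vielen Dank merci beaucoup for your help."

t0 = perf_counter()
wtp = WtP("wtp-bert-mini")                                  # one-time cost
t1 = perf_counter()
substrings = wtp.split(text_or_texts=text, threshold=1e-4)  # per-call cost
t2 = perf_counter()

print(f"model load: {t1 - t0:.3f}s, split: {t2 - t1:.3f}s")
```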
],
"metadata": {
2 changes: 1 addition & 1 deletion split-lang-demo.ipynb
@@ -7,7 +7,7 @@
"outputs": [],
"source": [
"%%capture\n",
"%pip install split-lang==1.3.1"
"%pip install split-lang==1.3.2"
]
},
{
