diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..7aae70d --- /dev/null +++ b/.coveragerc @@ -0,0 +1,10 @@ +[report] +exclude_lines = + pragma: no cover + def __repr__ + if self.debug: + if settings.DEBUG + raise AssertionError + raise NotImplementedError + if 0: + if __name__ == .__main__.: diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..a8d130a --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,50 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.5, 3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pip install pytest + pip install pytest-cov + pytest --cov=pysbd tests/ --color yes --cov-report=xml --cov-report=html + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.7 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: true diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1331ca5..0000000 --- a/.travis.yml +++ /dev/null @@ -1,12 +0,0 @@ -language: python -python: - - "3.6" - -# command to install dependencies -install: - - "pip install pipenv --upgrade-strategy=only-if-needed" - - "pipenv install --dev" - -# command to run the dependencies -script: - - "pytest" diff --git a/CHANGELOG.md b/CHANGELOG.md index 64bf68a..6ddeaaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,46 +1,48 @@ -# v0.1.0 - -- Initial Release - -# v0.1.1 - -- English language support only -- Support for oother languages - WIP - -# v0.1.2 +# v0.3.0rc +- ✨ 💫 sent `char_span` through with spaCy & regex approach - \#63 +- ♻️ Refactoring to support multiple languages +- ✨ 💫Initial language support for - Hindi, Marathi, Chinese, Spanish +- ✅ Updated tests - more coverage & regression tests for issues +- 👷👷🏻‍♀️ GitHub actions for CI-CD +- 💚☂️ Add code coverage - coverage.py Add Codecov +- 🐛 Fix incorrect text span & vanilla pysbd vs spacy output discrepancy - \#49, \#53, \#55 , \#59 +- 🐛 Fix `NUMBERED_REFERENCE_REGEX` for zero or one time - \#58 +- 🔐Fix security vulnerability bleach - \#62 -- 🐛BugFix - IndexError of `scanlists` function -# v0.1.3 - -- 🐛 Fix `lists_item_replacer` - \#29 -- 🐛 Fix & ♻️refactor `replace_multi_period_abbreviations` - \#30 -- 🐛 Fix `abbreviation_replacer` - \#31 -- ✅ Add regression tests for issues - -# v0.1.4 - -- ✨ ✅ Handle intermittent punctuations - \#34 +# v0.2.3 +- 🐛 Performance improvement in 
`abbreviation_replacer`- \#50 -# v0.1.5 +# v0.2.2 +- 🐛 Fix unbalanced parenthesis - \#47 -- 🐛 Handle text with only punctuations - \#36 -- 🐛 Handle exclamation marks at EOL- \#37 +# v0.2.1 +- ✨pySBD as a spaCy component through entrypoints # v0.2.0 - - ✨Add `char_span` parameter (optional) to get sentence & its (start, end) char offsets from original text - ✨pySBD as a spaCy component example - 🐛 Fix double question mark swallow bug - \#39 -# v0.2.1 +# v0.1.5 +- 🐛 Handle text with only punctuations - \#36 +- 🐛 Handle exclamation marks at EOL- \#37 -- ✨pySBD as a spaCy component through entrypoints +# v0.1.4 +- ✨ ✅ Handle intermittent punctuations - \#34 -# v0.2.2 +# v0.1.3 +- 🐛 Fix `lists_item_replacer` - \#29 +- 🐛 Fix & ♻️refactor `replace_multi_period_abbreviations` - \#30 +- 🐛 Fix `abbreviation_replacer` - \#31 +- ✅ Add regression tests for issues -- 🐛 Fix unbalanced parenthesis - \#47 +# v0.1.2 +- 🐛BugFix - IndexError of `scanlists` function -# v0.2.3 +# v0.1.1 +- English language support only +- Support for oother languages - WIP -- 🐛 Performance improvement in `abbreviation_replacer`- \#50 +# v0.1.0 +- Initial Release diff --git a/Pipfile b/Pipfile index c68775c..57b5433 100644 --- a/Pipfile +++ b/Pipfile @@ -4,14 +4,14 @@ url = "https://pypi.org/simple" verify_ssl = true [dev-packages] -yapf = "*" ipython = "*" ipdb = "*" twine = "*" +pytest = "*" +pytest-cov = "*" [packages] -pytest = "*" -autopep8 = "*" +spacy = "*" [requires] python_version = "3.6" diff --git a/Pipfile.lock b/Pipfile.lock index dba377f..eaae333 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "75ed38fca92ae22d4a12464439e8bcc2c4f69b21bc784927da7bc1fe936a1fff" + "sha256": "a706ba0ffb723d164bfa0863ae3f42f86ee9a06d3d4db9366e4655011591cc87" }, "pipfile-spec": 6, "requires": { @@ -16,105 +16,239 @@ ] }, "default": { - "atomicwrites": { - "hashes": [ - "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", - 
"sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" - ], - "version": "==1.3.0" - }, - "attrs": { - "hashes": [ - "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", - "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" - ], - "version": "==19.1.0" - }, - "autopep8": { - "hashes": [ - "sha256:4d8eec30cc81bc5617dbf1218201d770dc35629363547f17577c61683ccfb3ee" - ], - "index": "pypi", - "version": "==1.4.4" + "blis": { + "hashes": [ + "sha256:00473602629ba69fe6565108e21957e918cb48b59f5bf2f6bfb6e04de42500cb", + "sha256:03c368c9716ca814c436550a5f1e02ccf74850e613602519e3941d212e5aa177", + "sha256:135450caabc8aea9bb9250329ebdf7189982d9b57d5c92789b2ba2fe52c247a7", + "sha256:1402d9cbb0fbc21b749dd5b87d7ee14249e74a0ca38be6ecc56b3b356fca2f21", + "sha256:26b16d6005bb2671699831b5cc699905215d1abde1ec5c1d04de7dcd9eb29f75", + "sha256:3347a4b1b7d3ae14476aac9a6f7bf8ebf464863f4ebf4aea228874a7694ea240", + "sha256:38fe877a4b52e762f5e137a412e3c256545a696a12ae8c40d67b8815d2bb5097", + "sha256:4fb89c47ee06b58a4410a16fd5794847517262c9d2a342643475b477dfeff0a4", + "sha256:77a6486b9794af01bcdfd1bc6e067c93add4b93292e6f95bf6e5ce7f98bf0163", + "sha256:856142a11e37fd2c47c5006a3197e157bb8469a491a73d2d442223dd3279df84", + "sha256:8aeaf6954351593a1e412f80e398aa51df588d3c0de74b9f3323b694c603381b", + "sha256:9ede123065f3cacb109967755b3d83d4ca0de90643a9058129a6ab2d4051954f", + "sha256:d1d59faebc1c94f8f4f77154ef4b9d6d40364b111cf8fde48ee3b524c85f1075", + "sha256:d69257d317e86f34a7f230a2fd1f021fd2a1b944137f40d8cdbb23bd334cd0c4", + "sha256:ddd732c5274d1082fa92e2c42317587d5ebabce7741ca98120f69bd45d004b99", + "sha256:f0b0dad4d6268d9dba0a65a9db12dd7a2d8686b648399e4aa1aec7550697e99e" + ], + "version": "==0.4.1" + }, + "catalogue": { + "hashes": [ + "sha256:584d78e7f4c3c6e2fd498eb56dfc8ef1f4ff738480237de2ccd26cbe2cf47172", + "sha256:d74d1d856c6b36a37bf14aa6dbbc27d0582667b7ab979a6108e61a575e8723f5" + ], + "version": "==1.0.0" }, - 
"importlib-metadata": { + "certifi": { "hashes": [ - "sha256:a9f185022cfa69e9ca5f7eabfd5a58b689894cb78a11e3c8c89398a8ccbb8e7f", - "sha256:df1403cd3aebeb2b1dcd3515ca062eecb5bd3ea7611f18cba81130c68707e879" + "sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304", + "sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519" ], - "version": "==0.17" + "version": "==2020.4.5.1" }, - "more-itertools": { + "chardet": { "hashes": [ - "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7", - "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a" + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" ], - "markers": "python_version > '2.7'", - "version": "==7.0.0" + "version": "==3.0.4" }, - "packaging": { + "cymem": { "hashes": [ - "sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af", - "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3" + "sha256:5083b2ab5fe13ced094a82e0df465e2dbbd9b1c013288888035e24fd6eb4ed01", + "sha256:622c20a57701d02f01a47e856dea248e112638f28c8249dbe3ed95a9702e3d74", + "sha256:6f4cb689a9552e9e13dccc89203c8ab09f210a7ffb92ce27c384a4a0be27b527", + "sha256:719f04a11ca709fc2b47868070d79fccff77e5d502ff32de2f4baa73cb16166f", + "sha256:7236252bed70f37b898933dcf8aa875d0829664a245a272516f27b30439df71c", + "sha256:7f5ddceb12b73f7fd2e4398266401b6f887003740ccd18c989a2af04500b5f2b", + "sha256:85b9364e099426bd7f445a7705aad87bf6dbb71d79e3802dd8ca14e181d38a33", + "sha256:c288a1bbdf58c360457443e5297e74844e1961e5e7001dbcb3a5297a41911a11", + "sha256:cd21ec48ee70878d46c486e2f7ae94b32bfc6b37c4d27876c5a5a00c4eb75c3c", + "sha256:d7505c500d994f11662e5595f5002251f572acc189f18944619352e2636f5181", + "sha256:dd24848fbd75b17bab06408da6c029ba7cc615bd9e4a1f755fb3a090025fb922", + "sha256:f4f19af4bca81f11922508a9dcf30ce1d2aee4972af9f81ce8e5331a6f46f5e1" ], - "version": 
"==19.0" + "version": "==2.0.3" }, - "pluggy": { + "idna": { "hashes": [ - "sha256:0825a152ac059776623854c1543d65a4ad408eb3d33ee114dff91e57ec6ae6fc", - "sha256:b9817417e95936bf75d85d3f8767f7df6cdde751fc40aed3bb3074cbcb77757c" + "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb", + "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa" ], - "version": "==0.12.0" + "version": "==2.9" }, - "py": { + "importlib-metadata": { "hashes": [ - "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", - "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" - ], - "version": "==1.8.0" + "sha256:0505dd08068cfec00f53a74a0ad927676d7757da81b7436a6eefe4c7cf75c545", + "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958" + ], + "markers": "python_version < '3.8'", + "version": "==1.6.1" + }, + "murmurhash": { + "hashes": [ + "sha256:27b908fe4bdb426f4e4e4a8821acbe0302915b2945e035ec9d8ca513e2a74b1f", + "sha256:33405103fa8cde15d72ee525a03d5cfe2c7e4901133819754810986e29627d68", + "sha256:386a9eed3cb27cb2cd4394b6521275ba04552642c2d9cab5c9fb42aa5a3325c0", + "sha256:3af36a0dc9f13f6892d9b8b39a6a3ccf216cae5bce38adc7c2d145677987772f", + "sha256:717196a04cdc80cc3103a3da17b2415a8a5e1d0d578b7079259386bf153b3258", + "sha256:8a4ed95cd3456b43ea301679c7c39ade43fc18b844b37d0ba0ac0d6acbff8e0c", + "sha256:8b045a79e8b621b4b35b29f29e33e9e0964f3a276f7da4d5736142f322ad4842", + "sha256:a6c071b4b498bcea16a8dc8590cad81fa8d43821f34c74bc00f96499e2527073", + "sha256:b0afe329701b59d02e56bc6cee7325af83e3fee9c299c615fc1df3202b4f886f", + "sha256:ba766343bdbcb928039b8fff609e80ae7a5fd5ed7a4fc5af822224b63e0cbaff", + "sha256:bf33490514d308bcc27ed240cb3eb114f1ec31af031535cd8f27659a7049bd52", + "sha256:c7a646f6b07b033642b4f52ae2e45efd8b80780b3b90e8092a0cec935fbf81e2", + "sha256:cc97ea766ac545074bab0e5af3dbc48e0d05ba230ae5a404e284d39abe4b3baf", + "sha256:d696c394ebd164ca80b5871e2e9ad2f9fdbb81bd3c552c1d5f1e8ee694e6204a", + 
"sha256:f468e4868f78c3ac202a66abfe2866414bca4ae7666a21ef0938c423de0f7d50", + "sha256:fe344face8d30a5a6aa26e5acf288aa2a8f0f32e05efdda3d314b4bf289ec2af" + ], + "version": "==1.0.2" + }, + "numpy": { + "hashes": [ + "sha256:0172304e7d8d40e9e49553901903dc5f5a49a703363ed756796f5808a06fc233", + "sha256:34e96e9dae65c4839bd80012023aadd6ee2ccb73ce7fdf3074c62f301e63120b", + "sha256:3676abe3d621fc467c4c1469ee11e395c82b2d6b5463a9454e37fe9da07cd0d7", + "sha256:3dd6823d3e04b5f223e3e265b4a1eae15f104f4366edd409e5a5e413a98f911f", + "sha256:4064f53d4cce69e9ac613256dc2162e56f20a4e2d2086b1956dd2fcf77b7fac5", + "sha256:4674f7d27a6c1c52a4d1aa5f0881f1eff840d2206989bae6acb1c7668c02ebfb", + "sha256:7d42ab8cedd175b5ebcb39b5208b25ba104842489ed59fbb29356f671ac93583", + "sha256:965df25449305092b23d5145b9bdaeb0149b6e41a77a7d728b1644b3c99277c1", + "sha256:9c9d6531bc1886454f44aa8f809268bc481295cf9740827254f53c30104f074a", + "sha256:a78e438db8ec26d5d9d0e584b27ef25c7afa5a182d1bf4d05e313d2d6d515271", + "sha256:a7acefddf994af1aeba05bbbafe4ba983a187079f125146dc5859e6d817df824", + "sha256:a87f59508c2b7ceb8631c20630118cc546f1f815e034193dc72390db038a5cb3", + "sha256:ac792b385d81151bae2a5a8adb2b88261ceb4976dbfaaad9ce3a200e036753dc", + "sha256:b03b2c0badeb606d1232e5f78852c102c0a7989d3a534b3129e7856a52f3d161", + "sha256:b39321f1a74d1f9183bf1638a745b4fd6fe80efbb1f6b32b932a588b4bc7695f", + "sha256:cae14a01a159b1ed91a324722d746523ec757357260c6804d11d6147a9e53e3f", + "sha256:cd49930af1d1e49a812d987c2620ee63965b619257bd76eaaa95870ca08837cf", + "sha256:e15b382603c58f24265c9c931c9a45eebf44fe2e6b4eaedbb0d025ab3255228b", + "sha256:e91d31b34fc7c2c8f756b4e902f901f856ae53a93399368d9a0dc7be17ed2ca0", + "sha256:ef627986941b5edd1ed74ba89ca43196ed197f1a206a3f18cc9faf2fb84fd675", + "sha256:f718a7949d1c4f622ff548c572e0c03440b49b9531ff00e4ed5738b459f011e8" + ], + "version": "==1.18.5" + }, + "plac": { + "hashes": [ + "sha256:398cb947c60c4c25e275e1f1dadf027e7096858fb260b8ece3b33bcff90d985f", + 
"sha256:487e553017d419f35add346c4c09707e52fa53f7e7181ce1098ca27620e9ceee" + ], + "version": "==1.1.3" + }, + "preshed": { + "hashes": [ + "sha256:0c15ae62f2595ca479decc3452967484dae57b510278800f5deb9115238cc818", + "sha256:190345724eb3f7aeaeb2a758740d698bd6c017c2cdf07c71c16b34820973d114", + "sha256:1be3cb59211282e906a11443464fe3e19f6561e2fcd06410e4adc6d45354cf82", + "sha256:1ef72a120e49356058b3c0590d7b5e91f2747b44e006eef6579be6131223cab0", + "sha256:253970beae87ab672a6afb543908761795eea3cb7b0d784e2ea51e265752059e", + "sha256:448d9df12e63fe4a3024f6153ee6703bb95d2be0ce887b5eda7ddc41acfba825", + "sha256:61d73468c97c1d6d5a048de0b01d5a6fd052123358aca4823cdb277e436436cb", + "sha256:633358f1fb0ec5dd6dbe4971c328d08809e5a8dbefdf13a802ae0a7cb45306c7", + "sha256:6518bbd5fb8adbc3231e75ae78d96a7bdd5405a3b23a09d5e62a2e4fc833724e", + "sha256:7e80ffc1fb79496d4feafe0eaf71ee5e532b91daf6cec235d7f9c4c12657a58c", + "sha256:7ea588a78aaf310ae2c293071a8571b07ae434819be05fe510442b6df3f8fbf7", + "sha256:88427346b220293439db77c82913791fa13edc6ac73d8159610699a3ca17aae9", + "sha256:8a9a8222a697a513f25a94733e7a17cc298ecd8fd56b606a1d8fa0ac342c2830", + "sha256:b4ae6c7c44aa3ff7bd717791bb6b619ecb273b7cb128c986f2dc65f6e0e6ddd4", + "sha256:e37058d91bd7f0f5a7a9c83d22a83dc581ab5f79688a87be81f200993145a250", + "sha256:ece5e850f667eaa3367d5c56dda9e3aa6ac1c0bb2117d2f466a26db5f26bbe4b" + ], + "version": "==3.0.2" }, - "pycodestyle": { + "requests": { "hashes": [ - "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", - "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], - "version": "==2.5.0" + "version": "==2.23.0" }, - "pyparsing": { + "spacy": { "hashes": [ - "sha256:1873c03321fc118f4e9746baf201ff990ceb915f433f23b395f5580d1840cb2a", - "sha256:9b6323ef4ab914af344ba97510e966d64ba91055d6b9afa6b30799340e89cc03" 
+ "sha256:01202066f75c7f2cfeb9c167c3184b5b0a9d465604b0ca553bd9e788353c5905", + "sha256:212314be762bd40dfbbeeba1c4742c242e4b6ea3f9340891f0ff282b2e723ed0", + "sha256:6c1618c05bf65ae4bc94608f2390130ca21112fb3d920d1a03727691e3e7fb1b", + "sha256:7313b4fa921ed997d9719f99f5a375d672d2f4a908c7750033c4b37d9fa8547a", + "sha256:877d8e157a708c8b77c0dea61e526632f6d57f27be64087dac22a4581facea68", + "sha256:c5e6f8155f6b54a8ef89637b3c7d553f0ddb5478c4dd568fde7392efbf8a26c8", + "sha256:ce3886e9bfb9071d2708d2cd7157ada93ab378bbb38cf079842181cd671fc6f9", + "sha256:f0f3a67c5841e6e35d62c98f40ebb3d132587d3aba4f4dccac5056c4e90ff5b9", + "sha256:f75ba238066455f5b5498a987b4e2c84705d92138e02e890e0b0a1d1eb2d9462", + "sha256:fd740cb1b50cd86c648f64313be4734b0c2a2931d83761f46821061f42d791a3" ], - "version": "==2.4.0" + "index": "pypi", + "version": "==2.2.4" + }, + "srsly": { + "hashes": [ + "sha256:18bad26c34cf5a8853fbf018fd168a7bf2ea7ce661e66476c25dac711cb79c9b", + "sha256:2179cf1e88c250e89e40227bd5848341011c170079b3d424987d067de6a73f42", + "sha256:21cfb0e5dea2c4515b5c2daa78402d5782c6425b4f58af40d2e2cb45e4778d8c", + "sha256:29434753a77481ec6129991f4116f983085cc8005c1ad963261124842e8c05fc", + "sha256:3f3975e8cb67194d26dd03508469b1303f8b994f30e7782f7eae25fef6dc4aad", + "sha256:46213d8f094b348a9433c825ac1eba36a21aa25a8bae6f29c2f9f053e15be961", + "sha256:59258b81d567df207f8a0a33c4b5fa232afccf1d927c8ce3ba5395bfd64c0ed8", + "sha256:7c553a709fd56a37a07f969e849f55a0aeabaeb7677bebc588a640ab8ec134aa", + "sha256:95849d84e8929be248a180e672c8ce1ed98b1341263bc983efdf8427465584f1", + "sha256:b94d8a13c60e3298a9ba12b1b211026e8378c7d087efd7ce46a3f2d8d4678d94", + "sha256:c8beff52c104a7ffe4a15513a05dc0497998cf83aa1ca39454489994d18c1c07", + "sha256:d409beb7257208633c974c01f9dc3265562fb6802caee7de21880761ba87c3ed" + ], + "version": "==1.0.2" + }, + "thinc": { + "hashes": [ + "sha256:0522cc8b7a74e1de0902b55e1f141f889a088565f72ea0042a9c0f7f3ce83879", + 
"sha256:1375c11ed4f7c7178a5749e17b2f3bb1644c98ecc8874e402aceaeec63df6297", + "sha256:23b77994be3376cd8efa85adfa1bcf0ffcb4cfd279f48a3ab842570f419334ca", + "sha256:2aa4cab69067f9dbe4ed7a1d937a4467edcc5f50d43996fba8c645f08ab1f387", + "sha256:523e9be1bfaa3ed1d03d406ce451b6b4793a9719d5b83d2ea6b3398b96bc58b8", + "sha256:5ac162b010f21f8fcc3fd10766025fad3ec670f6b2e0a72284912332d1ae292a", + "sha256:7bb69a8cace8d85a3f65d94176f381c5216df08d79a520b005653d0a23f523a8", + "sha256:9c40101f3148405cb291be2033758d011d348a5dea5d151811def8d1e466f25a", + "sha256:a7332e323b76d63e1cfd2e6bc08a5527c5a6a0eba39197c56af8fe6eef62ef69", + "sha256:d1ee60d44ee840b75c0c0a3ade70908f05f414a65f20082483a5a5bfe82e9497", + "sha256:ebb81b7ff8f852aae1b9c26dfb629344ab962e221ec87c83b2a7c4aec337477d", + "sha256:f3c5786238991925694aba81fa305c1f2290a960fe5428a26b6f82134b260ad1" + ], + "version": "==7.4.0" }, - "pytest": { + "tqdm": { "hashes": [ - "sha256:6032845e68a17a96e8da3088037f899b56357769a724122056265ca2ea1890ee", - "sha256:bea27a646a3d74cbbcf8d3d4a06b2dfc336baf3dc2cc85cf70ad0157e73e8322" + "sha256:07c06493f1403c1380b630ae3dcbe5ae62abcf369a93bbc052502279f189ab8c", + "sha256:cd140979c2bebd2311dfb14781d8f19bd5a9debb92dcab9f6ef899c987fcf71f" ], - "index": "pypi", - "version": "==4.6.2" + "version": "==4.46.1" }, - "six": { + "urllib3": { "hashes": [ - "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", - "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527", + "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115" ], - "version": "==1.12.0" + "version": "==1.25.9" }, - "wcwidth": { + "wasabi": { "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + "sha256:b8dd3e963cd693fde1eb6bfbecf51790171aa3534fa299faf35cf269f2fd6063", + 
"sha256:da1f100e0025fe1e50fd67fa5b0b05df902187d5c65c86dc110974ab856d1f05" ], - "version": "==0.1.7" + "version": "==0.6.0" }, "zipp": { "hashes": [ - "sha256:8c1019c6aad13642199fbe458275ad6a84907634cc9f0989877ccc4a2840139d", - "sha256:ca943a7e809cc12257001ccfb99e3563da9af99d52f261725e96dfe0f9275bc3" + "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b", + "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96" ], - "version": "==0.5.1" + "version": "==3.1.0" } }, "develop": { @@ -126,6 +260,13 @@ "markers": "sys_platform == 'darwin'", "version": "==0.1.0" }, + "attrs": { + "hashes": [ + "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", + "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" + ], + "version": "==19.3.0" + }, "backcall": { "hashes": [ "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", @@ -135,17 +276,17 @@ }, "bleach": { "hashes": [ - "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", - "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" + "sha256:2bce3d8fab545a6528c8fa5d9f9ae8ebc85a56da365c7f85180bfe96a35ef22f", + "sha256:3c4c520fdb9db59ef139915a5db79f8b51bc2a7257ea0389f30c846883430a4b" ], - "version": "==3.1.0" + "version": "==3.1.5" }, "certifi": { "hashes": [ - "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", - "sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae" + "sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304", + "sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519" ], - "version": "==2019.3.9" + "version": "==2020.4.5.1" }, "chardet": { "hashes": [ @@ -154,42 +295,85 @@ ], "version": "==3.0.4" }, + "coverage": { + "hashes": [ + "sha256:00f1d23f4336efc3b311ed0d807feb45098fc86dee1ca13b3d6768cdab187c8a", + "sha256:01333e1bd22c59713ba8a79f088b3955946e293114479bbfc2e37d522be03355", + 
"sha256:0cb4be7e784dcdc050fc58ef05b71aa8e89b7e6636b99967fadbdba694cf2b65", + "sha256:0e61d9803d5851849c24f78227939c701ced6704f337cad0a91e0972c51c1ee7", + "sha256:1601e480b9b99697a570cea7ef749e88123c04b92d84cedaa01e117436b4a0a9", + "sha256:2742c7515b9eb368718cd091bad1a1b44135cc72468c731302b3d641895b83d1", + "sha256:2d27a3f742c98e5c6b461ee6ef7287400a1956c11421eb574d843d9ec1f772f0", + "sha256:402e1744733df483b93abbf209283898e9f0d67470707e3c7516d84f48524f55", + "sha256:5c542d1e62eece33c306d66fe0a5c4f7f7b3c08fecc46ead86d7916684b36d6c", + "sha256:5f2294dbf7875b991c381e3d5af2bcc3494d836affa52b809c91697449d0eda6", + "sha256:6402bd2fdedabbdb63a316308142597534ea8e1895f4e7d8bf7476c5e8751fef", + "sha256:66460ab1599d3cf894bb6baee8c684788819b71a5dc1e8fa2ecc152e5d752019", + "sha256:782caea581a6e9ff75eccda79287daefd1d2631cc09d642b6ee2d6da21fc0a4e", + "sha256:79a3cfd6346ce6c13145731d39db47b7a7b859c0272f02cdb89a3bdcbae233a0", + "sha256:7a5bdad4edec57b5fb8dae7d3ee58622d626fd3a0be0dfceda162a7035885ecf", + "sha256:8fa0cbc7ecad630e5b0f4f35b0f6ad419246b02bc750de7ac66db92667996d24", + "sha256:a027ef0492ede1e03a8054e3c37b8def89a1e3c471482e9f046906ba4f2aafd2", + "sha256:a3f3654d5734a3ece152636aad89f58afc9213c6520062db3978239db122f03c", + "sha256:a82b92b04a23d3c8a581fc049228bafde988abacba397d57ce95fe95e0338ab4", + "sha256:acf3763ed01af8410fc36afea23707d4ea58ba7e86a8ee915dfb9ceff9ef69d0", + "sha256:adeb4c5b608574a3d647011af36f7586811a2c1197c861aedb548dd2453b41cd", + "sha256:b83835506dfc185a319031cf853fa4bb1b3974b1f913f5bb1a0f3d98bdcded04", + "sha256:bb28a7245de68bf29f6fb199545d072d1036a1917dca17a1e75bbb919e14ee8e", + "sha256:bf9cb9a9fd8891e7efd2d44deb24b86d647394b9705b744ff6f8261e6f29a730", + "sha256:c317eaf5ff46a34305b202e73404f55f7389ef834b8dbf4da09b9b9b37f76dd2", + "sha256:dbe8c6ae7534b5b024296464f387d57c13caa942f6d8e6e0346f27e509f0f768", + "sha256:de807ae933cfb7f0c7d9d981a053772452217df2bf38e7e6267c9cbf9545a796", + "sha256:dead2ddede4c7ba6cb3a721870f5141c97dc7d85a079edb4bd8d88c3ad5b20c7", 
+ "sha256:dec5202bfe6f672d4511086e125db035a52b00f1648d6407cc8e526912c0353a", + "sha256:e1ea316102ea1e1770724db01998d1603ed921c54a86a2efcb03428d5417e489", + "sha256:f90bfc4ad18450c80b024036eaf91e4a246ae287701aaa88eaebebf150868052" + ], + "version": "==5.1" + }, "decorator": { "hashes": [ - "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", - "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" + "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760", + "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7" ], - "version": "==4.4.0" + "version": "==4.4.2" }, "docutils": { "hashes": [ - "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", - "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", - "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6" + "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", + "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc" ], - "version": "==0.14" + "version": "==0.16" }, "idna": { "hashes": [ - "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", - "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb", + "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa" + ], + "version": "==2.9" + }, + "importlib-metadata": { + "hashes": [ + "sha256:0505dd08068cfec00f53a74a0ad927676d7757da81b7436a6eefe4c7cf75c545", + "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958" ], - "version": "==2.8" + "markers": "python_version < '3.8'", + "version": "==1.6.1" }, "ipdb": { "hashes": [ - "sha256:dce2112557edfe759742ca2d0fee35c59c97b0cc7a05398b791079d78f1519ce" + "sha256:77fb1c2a6fccdfee0136078c9ed6fe547ab00db00bebff181f1e8c9e13418d49" ], "index": "pypi", - "version": "==0.12" + "version": "==0.13.2" }, "ipython": 
{ "hashes": [ - "sha256:54c5a8aa1eadd269ac210b96923688ccf01ebb2d0f21c18c3c717909583579a8", - "sha256:e840810029224b56cd0d9e7719dc3b39cf84d577f8ac686547c8ba7a06eeab26" + "sha256:0ef1433879816a960cd3ae1ae1dc82c64732ca75cec8dab5a4e29783fb571d0e", + "sha256:1b85d65632211bf5d3e6f1406f3393c8c429a47d7b947b9a87812aa5bce6595c" ], "index": "pypi", - "version": "==7.5.0" + "version": "==7.15.0" }, "ipython-genutils": { "hashes": [ @@ -200,25 +384,46 @@ }, "jedi": { "hashes": [ - "sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b", - "sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c" + "sha256:cd60c93b71944d628ccac47df9a60fec53150de53d42dc10a7fc4b5ba6aae798", + "sha256:df40c97641cb943661d2db4c33c2e1ff75d491189423249e989bcea4464f3030" + ], + "version": "==0.17.0" + }, + "keyring": { + "hashes": [ + "sha256:3401234209015144a5d75701e71cb47239e552b0882313e9f51e8976f9e27843", + "sha256:c53e0e5ccde3ad34284a40ce7976b5b3a3d6de70344c3f8ee44364cc340976ec" + ], + "version": "==21.2.1" + }, + "more-itertools": { + "hashes": [ + "sha256:558bb897a2232f5e4f8e2399089e35aecb746e1f9191b6584a151647e89267be", + "sha256:7818f596b1e87be009031c7653d01acc46ed422e6656b394b0f765ce66ed4982" + ], + "version": "==8.3.0" + }, + "packaging": { + "hashes": [ + "sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8", + "sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181" ], - "version": "==0.13.3" + "version": "==20.4" }, "parso": { "hashes": [ - "sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33", - "sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376" + "sha256:158c140fc04112dc45bca311633ae5033c2c2a7b732fa33d0955bad8152a8dd0", + "sha256:908e9fae2144a076d72ae4e25539143d40b8e3eafbaeae03c1bfe226f4cdf12c" ], - "version": "==0.4.0" + "version": "==0.7.0" }, "pexpect": { "hashes": [ - "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", - 
"sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" + "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937", + "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c" ], "markers": "sys_platform != 'win32'", - "version": "==4.7.0" + "version": "==4.8.0" }, "pickleshare": { "hashes": [ @@ -234,13 +439,19 @@ ], "version": "==1.5.0.1" }, + "pluggy": { + "hashes": [ + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" + ], + "version": "==0.13.1" + }, "prompt-toolkit": { "hashes": [ - "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", - "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1", - "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55" + "sha256:a402e9bf468b63314e37460b68ba68243d55b2f8c4d0192f85a019af3945050e", + "sha256:c93e53af97f630f12f5f62a3274e79527936ed466f038953dfa379d4941f651a" ], - "version": "==2.0.9" + "version": "==3.0.3" }, "ptyprocess": { "hashes": [ @@ -249,26 +460,56 @@ ], "version": "==0.6.0" }, + "py": { + "hashes": [ + "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa", + "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0" + ], + "version": "==1.8.1" + }, "pygments": { "hashes": [ - "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", - "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297" + "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44", + "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324" + ], + "version": "==2.6.1" + }, + "pyparsing": { + "hashes": [ + "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1", + "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b" + ], + "version": "==2.4.7" + }, + "pytest": { + "hashes": [ + 
"sha256:5c0db86b698e8f170ba4582a492248919255fcd4c79b1ee64ace34301fb589a1", + "sha256:7979331bfcba207414f5e1263b5a0f8f521d0f457318836a7355531ed1a4c7d8" + ], + "index": "pypi", + "version": "==5.4.3" + }, + "pytest-cov": { + "hashes": [ + "sha256:b6a814b8ed6247bd81ff47f038511b57fe1ce7f4cc25b9106f1a4b106f1d9322", + "sha256:c87dfd8465d865655a8213859f1b4749b43448b5fae465cb981e16d52a811424" ], - "version": "==2.4.2" + "index": "pypi", + "version": "==2.9.0" }, "readme-renderer": { "hashes": [ - "sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f", - "sha256:c8532b79afc0375a85f10433eca157d6b50f7d6990f337fa498c96cd4bfc203d" + "sha256:cbe9db71defedd2428a1589cdc545f9bd98e59297449f69d721ef8f1cfced68d", + "sha256:cc4957a803106e820d05d14f71033092537a22daa4f406dfbdd61177e0936376" ], - "version": "==24.0" + "version": "==26.0" }, "requests": { "hashes": [ - "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", - "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" ], - "version": "==2.22.0" + "version": "==2.23.0" }, "requests-toolbelt": { "hashes": [ @@ -279,46 +520,46 @@ }, "six": { "hashes": [ - "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", - "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", + "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" ], - "version": "==1.12.0" + "version": "==1.15.0" }, "tqdm": { "hashes": [ - "sha256:0a860bf2683fdbb4812fe539a6c22ea3f1777843ea985cb8c3807db448a0f7ab", - "sha256:e288416eecd4df19d12407d0c913cbf77aa8009d7fddb18f632aded3bdbdda6b" + "sha256:07c06493f1403c1380b630ae3dcbe5ae62abcf369a93bbc052502279f189ab8c", + 
"sha256:cd140979c2bebd2311dfb14781d8f19bd5a9debb92dcab9f6ef899c987fcf71f" ], - "version": "==4.32.1" + "version": "==4.46.1" }, "traitlets": { "hashes": [ - "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", - "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" + "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44", + "sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7" ], - "version": "==4.3.2" + "version": "==4.3.3" }, "twine": { "hashes": [ - "sha256:0fb0bfa3df4f62076cab5def36b1a71a2e4acb4d1fa5c97475b048117b1a6446", - "sha256:d6c29c933ecfc74e9b1d9fa13aa1f87c5d5770e119f5a4ce032092f0ff5b14dc" + "sha256:c1af8ca391e43b0a06bbc155f7f67db0bf0d19d284bfc88d1675da497a946124", + "sha256:d561a5e511f70275e5a485a6275ff61851c16ffcb3a95a602189161112d9f160" ], "index": "pypi", - "version": "==1.13.0" + "version": "==3.1.1" }, "urllib3": { "hashes": [ - "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", - "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527", + "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115" ], - "version": "==1.25.3" + "version": "==1.25.9" }, "wcwidth": { "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + "sha256:980fbf4f3c196c0f329cdcd1e84c554d6a211f18e252e525a0cf4223154a41d6", + "sha256:edbc2b718b4db6cdf393eefe3a420183947d6aa312505ce6754516f458ff8830" ], - "version": "==0.1.7" + "version": "==0.2.3" }, "webencodings": { "hashes": [ @@ -327,13 +568,12 @@ ], "version": "==0.5.1" }, - "yapf": { + "zipp": { "hashes": [ - "sha256:34f6f80c446dcb2c44bd644c4037a2024b6645e293a4c9c4521983dd0bb247a1", - "sha256:613deba14233623ff3432d9d5032631b5f600be97b39f66932cbe67648bfa8ea" + 
"sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b", + "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96" ], - "index": "pypi", - "version": "==0.27.0" + "version": "==3.1.0" } } } diff --git a/README.md b/README.md index 578a75e..a636df6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # pySBD: Python Sentence Boundary Disambiguation (SBD) -[![Build Status](https://travis-ci.org/nipunsadvilkar/pySBD.svg?branch=master)](https://travis-ci.org/nipunsadvilkar/pySBD) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/nipunsadvilkar/pySBD/blob/master/LICENSE) [![PyPi](https://img.shields.io/pypi/v/pysbd?color=blue&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/pysbd) [![GitHub](https://img.shields.io/github/v/release/nipunsadvilkar/pySBD.svg?include_prereleases&logo=github&style=flat)](https://github.com/nipunsadvilkar/pySBD) +![Python package](https://github.com/nipunsadvilkar/pySBD/workflows/Python%20package/badge.svg) [![codecov](https://codecov.io/gh/nipunsadvilkar/pySBD/branch/master/graph/badge.svg)](https://codecov.io/gh/nipunsadvilkar/pySBD) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/nipunsadvilkar/pySBD/blob/master/LICENSE) [![PyPi](https://img.shields.io/pypi/v/pysbd?color=blue&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/pysbd) [![GitHub](https://img.shields.io/github/v/release/nipunsadvilkar/pySBD.svg?include_prereleases&logo=github&style=flat)](https://github.com/nipunsadvilkar/pySBD) pySBD - python Sentence Boundary Disambiguation (SBD) - is a rule-based sentence boundary detection module that works out-of-the-box. 
diff --git a/pysbd/abbreviation_replacer.py b/pysbd/abbreviation_replacer.py index 03088a1..a00e52e 100644 --- a/pysbd/abbreviation_replacer.py +++ b/pysbd/abbreviation_replacer.py @@ -2,10 +2,6 @@ import re from pysbd.utils import Text -# TODO: SENTENCE_STARTERS should be lang specific -from pysbd.lang.standard import Abbreviation, SENTENCE_STARTERS -from pysbd.lang.common.numbers import Common, SingleLetterAbbreviationRules, AmPmRules - def replace_pre_number_abbr(txt, abbr): # prepend a space to avoid needing another regex for start of string @@ -40,28 +36,27 @@ def replace_period_of_abbr(txt, abbr): return txt -def replace_abbreviation_as_sentence_boundary(txt): - sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in SENTENCE_STARTERS)) - regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters) - txt = re.sub(regex, '\\1.', txt) - return txt - - class AbbreviationReplacer(object): - def __init__(self, text, language="en"): + def __init__(self, text, lang): self.text = text - self.language = language + self.lang = lang def replace(self): self.text = Text(self.text).apply( - Common.PossessiveAbbreviationRule, - Common.KommanditgesellschaftRule, - *SingleLetterAbbreviationRules.All + self.lang.PossessiveAbbreviationRule, + self.lang.KommanditgesellschaftRule, + *self.lang.SingleLetterAbbreviationRules.All ) self.text = self.search_for_abbreviations_in_string() self.replace_multi_period_abbreviations() - self.text = Text(self.text).apply(*AmPmRules.All) - self.text = replace_abbreviation_as_sentence_boundary(self.text) + self.text = Text(self.text).apply(*self.lang.AmPmRules.All) + self.text = self.replace_abbreviation_as_sentence_boundary() + return self.text + + def replace_abbreviation_as_sentence_boundary(self): + sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS)) + regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters) + self.text = re.sub(regex, 
'\\1.', self.text) return self.text def replace_multi_period_abbreviations(self): @@ -71,7 +66,7 @@ def mpa_replace(match): return match self.text = re.sub( - Common.MULTI_PERIOD_ABBREVIATION_REGEX, + self.lang.MULTI_PERIOD_ABBREVIATION_REGEX, mpa_replace, self.text, flags=re.IGNORECASE, @@ -80,7 +75,7 @@ def mpa_replace(match): def search_for_abbreviations_in_string(self): original = self.text lowered = original.lower() - for abbr in Abbreviation.ABBREVIATIONS: + for abbr in self.lang.Abbreviation.ABBREVIATIONS: stripped = abbr.strip() if stripped not in lowered: continue @@ -102,8 +97,8 @@ def scan_for_replacements(self, txt, am, ind, char_array): char = char_array[ind] except IndexError: char = "" - prepositive = Abbreviation.PREPOSITIVE_ABBREVIATIONS - number_abbr = Abbreviation.NUMBER_ABBREVIATIONS + prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS + number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS upper = str(char).isupper() if not upper or am.strip().lower() in prepositive: if am.strip().lower() in prepositive: @@ -113,8 +108,3 @@ def scan_for_replacements(self, txt, am, ind, char_array): else: txt = replace_period_of_abbr(txt, am) return txt - - -if __name__ == "__main__": - s = "fig. ??" - print(AbbreviationReplacer(s).replace()) diff --git a/pysbd/about.py b/pysbd/about.py index 086ebbf..e73ef72 100644 --- a/pysbd/about.py +++ b/pysbd/about.py @@ -2,7 +2,7 @@ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ __title__ = "pysbd" -__version__ = "0.2.3" +__version__ = "0.3.0rc" __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages." 
__uri__ = "http://nipunsadvilkar.github.io/" __author__ = "Nipun Sadvilkar" diff --git a/pysbd/between_punctuation.py b/pysbd/between_punctuation.py index d81aef4..0dbaf1b 100644 --- a/pysbd/between_punctuation.py +++ b/pysbd/between_punctuation.py @@ -92,8 +92,3 @@ def sub_punctuation_between_em_dashes(self, txt): def sub_punctuation_between_quotes_slanted(self, txt): return re.sub(self.BETWEEN_QUOTE_SLANTED_REGEX_2, replace_punctuation, txt) - - -if __name__ == "__main__": - text = "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention" - print(BetweenPunctuation(text).replace()) diff --git a/pysbd/cleaner.py b/pysbd/cleaner.py index 1ef0a97..57e50a0 100644 --- a/pysbd/cleaner.py +++ b/pysbd/cleaner.py @@ -2,14 +2,13 @@ import re from pysbd.utils import Text from pysbd.clean.rules import PDF, HTML, CleanRules as cr -from pysbd.lang.standard import Abbreviation class Cleaner(object): - def __init__(self, text, language='common', doc_type=None): + def __init__(self, text, lang, doc_type=None): self.text = text - self.language = language + self.lang = lang self.doc_type = doc_type def clean(self): @@ -96,7 +95,7 @@ def search_for_connected_sentences(self, word, txt, regex, rule): return txt if any(k in word for k in cr.URL_EMAIL_KEYWORDS): return txt - if any(a in word for a in Abbreviation.ABBREVIATIONS): + if any(a in word for a in self.lang.Abbreviation.ABBREVIATIONS): return txt new_word = Text(word).apply(rule) txt = re.sub(re.escape(word), new_word, txt) @@ -112,9 +111,3 @@ def clean_consecutive_characters(self): self.text = Text(self.text).apply( cr.ConsecutivePeriodsRule, cr.ConsecutiveForwardSlashRule) - - -if __name__ == "__main__": - text = "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot." 
- c = Cleaner(text) - print(c.clean()) diff --git a/pysbd/exclamation_words.py b/pysbd/exclamation_words.py index f582d36..0940614 100644 --- a/pysbd/exclamation_words.py +++ b/pysbd/exclamation_words.py @@ -16,7 +16,3 @@ def apply_rules(cls, text): return re.sub(ExclamationWords.EXCLAMATION_REGEX, replace_punctuation, text) - -if __name__ == "__main__": - text = "\"Dinah'll miss me very much to-night, I should think!\"ȸ" - print(ExclamationWords.apply_rules(text)) diff --git a/pysbd/lang/chinese.py b/pysbd/lang/chinese.py new file mode 100644 index 0000000..3ef9c31 --- /dev/null +++ b/pysbd/lang/chinese.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +import re +from pysbd.abbreviation_replacer import AbbreviationReplacer +from pysbd.between_punctuation import BetweenPunctuation +from pysbd.lang.common import Common, Standard +from pysbd.punctuation_replacer import replace_punctuation + +class Chinese(Common, Standard): + + iso_code = 'zh' + + class AbbreviationReplacer(AbbreviationReplacer): + SENTENCE_STARTERS = [] + + class BetweenPunctuation(BetweenPunctuation): + + def __init__(self, text): + super().__init__(text) + + def replace(self): + self.sub_punctuation_between_quotes_and_parens() + return self.text + + def sub_punctuation_between_double_angled_quotation_marks(self): + BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = r"《(?=(?P[^》\\]+|\\{2}|\\.)*)(?P=tmp)》" + self.text = re.sub(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX, replace_punctuation, + self.text) + + def sub_punctuation_between_l_bracket(self): + BETWEEN_L_BRACKET_REGEX = r"「(?=(?P[^」\\]+|\\{2}|\\.)*)(?P=tmp)」" + self.text = re.sub(BETWEEN_L_BRACKET_REGEX, replace_punctuation, + self.text) + + def sub_punctuation_between_quotes_and_parens(self): + self.sub_punctuation_between_double_angled_quotation_marks() + self.sub_punctuation_between_l_bracket() diff --git a/pysbd/lang/common/__init__.py b/pysbd/lang/common/__init__.py index e69de29..a5ece4a 100644 --- a/pysbd/lang/common/__init__.py +++ 
b/pysbd/lang/common/__init__.py @@ -0,0 +1,2 @@ +from .common import Common # noqa: F401 +from .standard import Standard # noqa: F401 diff --git a/pysbd/lang/common/common.py b/pysbd/lang/common/common.py new file mode 100644 index 0000000..32aae62 --- /dev/null +++ b/pysbd/lang/common/common.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +import re +from pysbd.utils import Rule + +class Common(object): + + # added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc. + # TODO: above special cases group can be updated as per developer needs + SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]" + + # # Rubular: http://rubular.com/r/NqCqv372Ix + QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]' + + # # Rubular: http://rubular.com/r/6flGnUMEVl + PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]' + + # # Rubular: http://rubular.com/r/TYzr4qOW1Q + # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/ + + # # Rubular: http://rubular.com/r/JMjlZHAT4g + SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])' + + # # Rubular: http://rubular.com/r/mQ8Es9bxtk + CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))' + + # https://rubular.com/r/UkumQaILKbkeyc + # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703 + NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])' + + # # Rubular: http://rubular.com/r/yqa4Rit8EY + PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯') + + # # Rubular: http://rubular.com/r/NEv265G2X2 + KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯') + + # # Rubular: http://rubular.com/r/xDkpFZ0EgH + MULTI_PERIOD_ABBREVIATION_REGEX = 
r"\b[a-z](?:\.[a-z])+[.]" + + class SingleLetterAbbreviationRules(object): + """Searches for periods within an abbreviation and + replaces the periods. + """ + # Rubular: http://rubular.com/r/e3H6kwnr6H + SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯') + + # Rubular: http://rubular.com/r/gitvf0YWH4 + SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯') + + All = [ + SingleUpperCaseLetterAtStartOfLineRule, SingleUpperCaseLetterRule + ] + + class AmPmRules(object): + + # Rubular: http://rubular.com/r/Vnx3m4Spc8 + UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.') + + # Rubular: http://rubular.com/r/AJMCotJVbW + UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.') + + # Rubular: http://rubular.com/r/13q7SnOhgA + LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.') + + # Rubular: http://rubular.com/r/DgUDq4mLz5 + LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.') + + All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule] + + class Numbers(object): + # Rubular: http://rubular.com/r/oNyxBOqbyy + PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯') + + # Rubular: http://rubular.com/r/EMk5MpiUzt + NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯') + + # Rubular: http://rubular.com/r/rf4l1HjtjG + NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯') + + # Rubular: http://rubular.com/r/HPa4sdc6b9 + StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯') + + # Rubular: http://rubular.com/r/NuvWnKleFl + StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯') + + All = [ + PeriodBeforeNumberRule, + NumberAfterPeriodBeforeLetterRule, + NewLineNumberPeriodSpaceLetterRule, + StartLineNumberPeriodRule, + StartLineTwoDigitNumberPeriodRule + ] + diff --git a/pysbd/lang/common/ellipsis.py b/pysbd/lang/common/ellipsis.py deleted file mode 100644 index a77fe5f..0000000 --- a/pysbd/lang/common/ellipsis.py +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- 
-from pysbd.utils import Rule - - -class EllipsisRules(object): - - # below rules aren't similar to original rules of pragmatic segmenter - # modification: spaces replaced with same number of symbols - # Rubular: http://rubular.com/r/i60hCK81fz - ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.') - - # Rubular: http://rubular.com/r/Hdqpd90owl - FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ') - - # Rubular: http://rubular.com/r/YBG1dIHTRu - ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟') - - # Rubular: http://rubular.com/r/2VvZ8wRbd8 - FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝') - - OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ') - - All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule, - ThreeConsecutiveRule, OtherThreePeriodRule] diff --git a/pysbd/lang/common/numbers.py b/pysbd/lang/common/numbers.py deleted file mode 100644 index ff3295e..0000000 --- a/pysbd/lang/common/numbers.py +++ /dev/null @@ -1,101 +0,0 @@ -# -*- coding: utf-8 -*- -import re -from pysbd.utils import Rule - - -class Common(object): - - # added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc. 
- # TODO: above special cases group can be updated as per developer needs - SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]" - - # # Rubular: http://rubular.com/r/NqCqv372Ix - QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]' - - # # Rubular: http://rubular.com/r/6flGnUMEVl - PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]' - - # # Rubular: http://rubular.com/r/TYzr4qOW1Q - # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/ - - # # Rubular: http://rubular.com/r/JMjlZHAT4g - SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])' - - # # Rubular: http://rubular.com/r/mQ8Es9bxtk - CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))' - - # https://rubular.com/r/UkumQaILKbkeyc - # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703 - NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)*\d{1,3}))(\s)(?=[A-Z])' - - # # Rubular: http://rubular.com/r/yqa4Rit8EY - PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯') - - # # Rubular: http://rubular.com/r/NEv265G2X2 - KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯') - - # # Rubular: http://rubular.com/r/xDkpFZ0EgH - MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]" - - -class AmPmRules(object): - - # Rubular: http://rubular.com/r/Vnx3m4Spc8 - UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.') - - # Rubular: http://rubular.com/r/AJMCotJVbW - UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.') - - # Rubular: http://rubular.com/r/13q7SnOhgA - LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.') - - # Rubular: http://rubular.com/r/DgUDq4mLz5 - LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.') - - All = [UpperCasePmRule, 
UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule] - - -class SingleLetterAbbreviationRules(object): - """Searches for periods within an abbreviation and - replaces the periods. - """ - - # Rubular: http://rubular.com/r/e3H6kwnr6H - SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯') - - # Rubular: http://rubular.com/r/gitvf0YWH4 - SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯') - - All = [ - SingleUpperCaseLetterAtStartOfLineRule, SingleUpperCaseLetterRule - ] - - -class Numbers(object): - # Rubular: http://rubular.com/r/oNyxBOqbyy - PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯') - - # Rubular: http://rubular.com/r/EMk5MpiUzt - NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯') - - # Rubular: http://rubular.com/r/rf4l1HjtjG - NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯') - - # Rubular: http://rubular.com/r/HPa4sdc6b9 - StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯') - - # Rubular: http://rubular.com/r/NuvWnKleFl - StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯') - - All = [ - PeriodBeforeNumberRule, - NumberAfterPeriodBeforeLetterRule, - NewLineNumberPeriodSpaceLetterRule, - StartLineNumberPeriodRule, - StartLineTwoDigitNumberPeriodRule - ] - - -if __name__ == "__main__": - txt = 'My friend work at Yahoo☄ amazing right?' - print(re.findall(Common.SENTENCE_BOUNDARY_REGEX, txt)) diff --git a/pysbd/lang/common/standard.py b/pysbd/lang/common/standard.py new file mode 100644 index 0000000..84ae080 --- /dev/null +++ b/pysbd/lang/common/standard.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- +from pysbd.utils import Rule +from pysbd.abbreviation_replacer import AbbreviationReplacer + +class Standard: + + # This class holds the punctuation marks. 
+ Punctuations = ['。', '.', '.', '!', '!', '?', '?'] + + # Rubular: http://rubular.com/r/G2opjedIm9 + GeoLocationRule = Rule(r'(?<=[a-zA-z]°)\.(?=\s*\d+)', '∯') + + FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯') + + SingleNewLineRule = Rule(r'\n', 'ȹ') + + # Rubular: http://rubular.com/r/aXPUGm6fQh + QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&') + + ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ') + + SubSingleQuoteRule = Rule(r'&⎋&', "'") + + class Abbreviation(object): + """Defines the abbreviations for each language (if available)""" + ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 
'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig'] + PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig'] + NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp'] + + # Part of "Abbreviations" ruby module + # Rubular: http://rubular.com/r/EUbZCNfgei + WithMultiplePeriodsAndEmailRule = Rule(r'(\w)(\.)(\w)', '\\1∮\\3') + + class DoublePunctuationRules(object): + FirstRule = Rule(r'\?!', '☉') + SecondRule = Rule(r'!\?', '☈') + ThirdRule = Rule(r'\?\?', '☇') + ForthRule = Rule(r'!!', '☄') + DoublePunctuation = r'\?!|!\?|\?\?|!!' + All = [FirstRule, SecondRule, ThirdRule, ForthRule] + + class ExclamationPointRules(object): + # Rubular: http://rubular.com/r/XS1XXFRfM2 + InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&') + + # Rubular: http://rubular.com/r/sl57YI8LkA + BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&') + + # Rubular: http://rubular.com/r/f9zTjmkIPb + MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&') + + All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule] + + class SubSymbolsRules(object): + Period = Rule(r'∯', '.') + ArabicComma = Rule(r'♬', '،') + SemiColon = Rule(r'♭', ':') + FullWidthPeriod = Rule(r'&ᓰ&', '。') + SpecialPeriod = Rule(r'&ᓱ&', '.') + FullWidthExclamation = Rule(r'&ᓳ&', '!') + ExclamationPoint = Rule(r'&ᓴ&', '!') + QuestionMark = Rule(r'&ᓷ&', '?') + FullWidthQuestionMark = Rule(r'&ᓸ&', '?') + MixedDoubleQE = Rule(r'☉', '?!') + MixedDoubleQQ = Rule(r'☇', '??') + MixedDoubleEQ = Rule(r'☈', '!?') + MixedDoubleEE = Rule(r'☄', '!!') + LeftParens = Rule(r'&✂&', '(') + RightParens = Rule(r'&⌬&', ')') + TemporaryEndingPunctutation = Rule(r'ȸ', '') + Newline = Rule(r'ȹ', "\n") + All = [Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod, + FullWidthExclamation, ExclamationPoint, QuestionMark, + 
FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ, + MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation, + Newline] + + class EllipsisRules(object): + + # below rules aren't similar to original rules of pragmatic segmenter + # modification: spaces replaced with same number of symbols + # Rubular: http://rubular.com/r/i60hCK81fz + ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.') + + # Rubular: http://rubular.com/r/Hdqpd90owl + FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ') + + # Rubular: http://rubular.com/r/YBG1dIHTRu + ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟') + + # Rubular: http://rubular.com/r/2VvZ8wRbd8 + FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝') + + OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ') + + All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule, + ThreeConsecutiveRule, OtherThreePeriodRule] + + class ReinsertEllipsisRules(object): + # below rules aren't similar to original rules of pragmatic segmenter + # modification: symbols replaced with same number of ellipses + SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...') + SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ') + SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . 
.') + SubTwoConsecutivePeriod = Rule(r'☏☏', '..') + SubOnePeriod = Rule(r'∮', '.') + All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod, + SubTwoConsecutivePeriod, SubOnePeriod] + + class AbbreviationReplacer(AbbreviationReplacer): + SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\ + "More She That The There They We What When Where Who Why".split(" ") diff --git a/pysbd/lang/english.py b/pysbd/lang/english.py index 765ba4a..f4cb10f 100644 --- a/pysbd/lang/english.py +++ b/pysbd/lang/english.py @@ -1,25 +1,11 @@ # -*- coding: utf-8 -*- -from pysbd.lang.common.numbers import Common -from pysbd.cleaner import Cleaner -# from pysbd.abbreviation_replacer import AbbreviationReplacer +from pysbd.abbreviation_replacer import AbbreviationReplacer +from pysbd.lang.common import Common, Standard +class English(Common, Standard): -class English(Common): + iso_code = 'en' - def __init__(self): - pass - - def clear_quotations(self, text): - raise NotImplementedError - - def abbreviations(self, text): - raise NotImplementedError - - -# class EnAbbreviationReplacer(AbbreviationReplacer): -# raise NotImplementedError - - -if __name__ == "__main__": - ec = English() - print(hasattr(ec, 'PARENS_BETWEEN_DOUBLE_QUOTES_REGEX')) + class AbbreviationReplacer(AbbreviationReplacer): + SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\ + "More She That The There They We What When Where Who Why".split(" ") diff --git a/pysbd/lang/hindi.py b/pysbd/lang/hindi.py new file mode 100644 index 0000000..e29bf38 --- /dev/null +++ b/pysbd/lang/hindi.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +from pysbd.abbreviation_replacer import AbbreviationReplacer +from pysbd.lang.common import Common, Standard + +class Hindi(Common, Standard): + + iso_code = 'hi' + + SENTENCE_BOUNDARY_REGEX = r'.*?[।\|!\?]|.*?$' + Punctuations = ['।', '|', '.', '!', '?'] + + class AbbreviationReplacer(AbbreviationReplacer): + SENTENCE_STARTERS = [] diff --git 
a/pysbd/lang/marathi.py b/pysbd/lang/marathi.py new file mode 100644 index 0000000..adc8313 --- /dev/null +++ b/pysbd/lang/marathi.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +# Grammer rules from https://gopract.com/Pages/Marathi-Grammar-Viramchinah.aspx +from pysbd.abbreviation_replacer import AbbreviationReplacer +from pysbd.lang.common import Common, Standard + +class Marathi(Common, Standard): + + iso_code = 'mr' + + SENTENCE_BOUNDARY_REGEX = r'.*?[.!?]|.*?$' + Punctuations = ['.', '!', '?'] + + class AbbreviationReplacer(AbbreviationReplacer): + SENTENCE_STARTERS = [] diff --git a/pysbd/lang/spanish.py b/pysbd/lang/spanish.py new file mode 100644 index 0000000..5954b34 --- /dev/null +++ b/pysbd/lang/spanish.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +from pysbd.abbreviation_replacer import AbbreviationReplacer +from pysbd.lang.common import Common, Standard + +class Spanish(Common, Standard): + + iso_code = 'es' + + class AbbreviationReplacer(AbbreviationReplacer): + SENTENCE_STARTERS = [] + + class Abbreviation(Standard.Abbreviation): + ABBREVIATIONS = ['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. 
sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta'] + PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta'] + NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel'] diff --git a/pysbd/lang/standard.py b/pysbd/lang/standard.py 
deleted file mode 100644 index a795b29..0000000 --- a/pysbd/lang/standard.py +++ /dev/null @@ -1,96 +0,0 @@ -# -*- coding: utf-8 -*- -from pysbd.utils import Rule - - -class Standard(object): - - # This class holds the punctuation marks. - Punctuations = ['。', '.', '.', '!', '!', '?', '?'] - - # Rubular: http://rubular.com/r/G2opjedIm9 - GeoLocationRule = Rule(r'(?<=[a-zA-z]°)\.(?=\s*\d+)', '∯') - - FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯') - - SingleNewLineRule = Rule(r'\n', 'ȹ') - - # Rubular: http://rubular.com/r/aXPUGm6fQh - QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&') - - ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ') - - SubSingleQuoteRule = Rule(r'&⎋&', "'") - - -class Abbreviation(object): - """Defines the abbreviations for each language (if available)""" - ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 
'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig'] - PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig'] - NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp'] - - - # Part of "Abbreviations" ruby module - # Rubular: http://rubular.com/r/EUbZCNfgei - WithMultiplePeriodsAndEmailRule = Rule(r'(\w)(\.)(\w)', '\\1∮\\3') - - -class DoublePunctuationRules(object): - FirstRule = Rule(r'\?!', '☉') - SecondRule = Rule(r'!\?', '☈') - ThirdRule = Rule(r'\?\?', '☇') - ForthRule = Rule(r'!!', '☄') - DoublePunctuation = r'\?!|!\?|\?\?|!!' - All = [FirstRule, SecondRule, ThirdRule, ForthRule] - - -class ExclamationPointRules(object): - # Rubular: http://rubular.com/r/XS1XXFRfM2 - InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&') - - # Rubular: http://rubular.com/r/sl57YI8LkA - BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&') - - # Rubular: http://rubular.com/r/f9zTjmkIPb - MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&') - - All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule] - - -class SubSymbolsRules(object): - Period = Rule(r'∯', '.') - ArabicComma = Rule(r'♬', '،') - SemiColon = Rule(r'♭', ':') - FullWidthPeriod = Rule(r'&ᓰ&', '。') - SpecialPeriod = Rule(r'&ᓱ&', '.') - FullWidthExclamation = Rule(r'&ᓳ&', '!') - ExclamationPoint = Rule(r'&ᓴ&', '!') - QuestionMark = Rule(r'&ᓷ&', '?') - FullWidthQuestionMark = Rule(r'&ᓸ&', '?') - MixedDoubleQE = Rule(r'☉', '?!') - MixedDoubleQQ = Rule(r'☇', '??') - MixedDoubleEQ = Rule(r'☈', '!?') - MixedDoubleEE = Rule(r'☄', '!!') - LeftParens = Rule(r'&✂&', '(') 
- RightParens = Rule(r'&⌬&', ')') - TemporaryEndingPunctutation = Rule(r'ȸ', '') - Newline = Rule(r'ȹ', "\n") - All = [Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod, - FullWidthExclamation, ExclamationPoint, QuestionMark, - FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ, - MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation, - Newline] - - -class ReinsertEllipsisRules(object): - # below rules aren't similar to original rules of pragmatic segmenter - # modification: symbols replaced with same number of ellipses - SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...') - SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ') - SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .') - SubTwoConsecutivePeriod = Rule(r'☏☏', '..') - SubOnePeriod = Rule(r'∮', '.') - All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod, - SubTwoConsecutivePeriod, SubOnePeriod] - - -SENTENCE_STARTERS = "A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why".split(" ") diff --git a/pysbd/languages.py b/pysbd/languages.py index e796ae6..e7682df 100644 --- a/pysbd/languages.py +++ b/pysbd/languages.py @@ -1,22 +1,28 @@ # -*- coding: utf-8 -*- -from pysbd.lang.standard import Standard from pysbd.lang.english import English +from pysbd.lang.hindi import Hindi +from pysbd.lang.marathi import Marathi +from pysbd.lang.chinese import Chinese +from pysbd.lang.spanish import Spanish -LANGUAGE_CODES = {'en': English} +LANGUAGE_CODES = { + 'en': English, + 'hi': Hindi, + 'mr': Marathi, + 'zh': Chinese, + 'es': Spanish +} class Language(object): def __init__(self, code): - self.code = LANGUAGE_CODES[code] + self.code = code @classmethod - def get_language_code(self, code): + def get_language_code(cls, code): try: return LANGUAGE_CODES[code] except KeyError: - return Standard - - -if __name__ == "__main__": - print(Language.get_language_code('standard')) + raise ValueError("Provide valid language ID 
i.e. ISO code. " + "Available codes are : {}".format(set(LANGUAGE_CODES.keys()))) diff --git a/pysbd/lists_item_replacer.py b/pysbd/lists_item_replacer.py index d0a1486..8a70a46 100644 --- a/pysbd/lists_item_replacer.py +++ b/pysbd/lists_item_replacer.py @@ -239,9 +239,3 @@ def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False): each, ind, alphabet, list_array, parens) return self.text - -if __name__ == "__main__": - text = "• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early" - li = ListItemReplacer(text) - li.add_line_break() - print(repr(li.text)) diff --git a/pysbd/processor.py b/pysbd/processor.py index ad79adc..8dd28cb 100644 --- a/pysbd/processor.py +++ b/pysbd/processor.py @@ -1,55 +1,60 @@ # -*- coding: utf-8 -*- import re -import os +import spacy from pysbd.utils import Text, TextSpan from pysbd.lists_item_replacer import ListItemReplacer -from pysbd.languages import Language -from pysbd.lang.standard import (Standard, Abbreviation, - DoublePunctuationRules, - ExclamationPointRules, SubSymbolsRules, - ReinsertEllipsisRules) -from pysbd.lang.common.numbers import Common, Numbers -from pysbd.lang.common.ellipsis import EllipsisRules from pysbd.exclamation_words import ExclamationWords from pysbd.between_punctuation import BetweenPunctuation from pysbd.abbreviation_replacer import AbbreviationReplacer +nlp = spacy.blank('en') class Processor(object): - def __init__(self, text, language='common', char_span=False): + def __init__(self, text, lang, char_span=False): """Process a text - do pre and post processing - to get proper sentences Parameters ---------- text : str Original text - language : str, optional - by default "common" i.e., english text preprocessing + language : object + Language module char_span : bool, optional Get start & end character offsets of each sentences within 
original text, by default False """ - self.language = language - self.language_module = Language.get_language_code(language) self.text = text + self.lang = lang self.char_span = char_span def process(self): if not self.text: - # return empty list? return self.text + self.doc = nlp(self.text) li = ListItemReplacer(self.text) self.text = li.add_line_break() - self.text = AbbreviationReplacer(self.text).replace() + self.replace_abbreviations() self.replace_numbers() self.replace_continuous_punctuation() self.replace_periods_before_numeric_references() self.text = Text(self.text).apply( - Abbreviation.WithMultiplePeriodsAndEmailRule, - Standard.GeoLocationRule, Standard.FileFormatRule) + self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule, + self.lang.GeoLocationRule, self.lang.FileFormatRule) processed = self.split_into_segments() - return processed + if self.char_span: + return self.sentences_with_char_spans(processed) + else: + return processed + + def sentences_with_char_spans(self, sentences): + sent_start_token_idx = [m.start() for sent in sentences for m in re.finditer(re.escape(sent), self.doc.text)] + for tok in self.doc: + if tok.idx in sent_start_token_idx: + tok.is_sent_start = True + else: + tok.is_sent_start = False + return [TextSpan(sent.text_with_ws, sent.start_char, sent.end_char) for sent in self.doc.sents] def rm_none_flatten(self, sents): """Remove None values and unpack list of list sents @@ -82,32 +87,23 @@ def split_into_segments(self): # remove empty and none values sents = self.rm_none_flatten(sents) sents = [ - Text(s).apply(Standard.SingleNewLineRule, *EllipsisRules.All) + Text(s).apply(self.lang.SingleNewLineRule, *self.lang.EllipsisRules.All) for s in sents ] - sents_w_spans = [self.check_for_punctuation(s) for s in sents] + sents = [self.check_for_punctuation(s) for s in sents] # flatten list of list of sentences - sents_w_spans = self.rm_none_flatten(sents_w_spans) - new_spans = [] - for sent_span in sents_w_spans: - if 
sent_span.sent.endswith('ȸ'): - sent_span.end = sent_span.end - 1 - sent_span.sent = Text(sent_span.sent).apply(*SubSymbolsRules.All) - post_process_sent = self.post_process_segments(sent_span.sent) + sents = self.rm_none_flatten(sents) + new_sents = [] + for sent in sents: + sent = Text(sent).apply(*self.lang.SubSymbolsRules.All) + post_process_sent = self.post_process_segments(sent) if post_process_sent and isinstance(post_process_sent, str): - sent_span.sent = post_process_sent - new_spans.append(sent_span) + new_sents.append(post_process_sent) elif isinstance(post_process_sent, list): - tmp_char_start = sent_span.start for pps in post_process_sent: - new_spans.append(TextSpan(pps, tmp_char_start, tmp_char_start + len(pps))) - tmp_char_start += len(pps) - for ns in new_spans: - ns.sent = Text(ns.sent).apply(Standard.SubSingleQuoteRule) - if self.char_span: - return new_spans - else: - return [s.sent for s in new_spans] + new_sents.append(pps) + new_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule) for ns in new_sents] + return new_sents def post_process_segments(self, txt): if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt): @@ -126,10 +122,10 @@ def post_process_segments(self, txt): # removed to retain original text spans # txt = Text(txt).apply(*ReinsertEllipsisRules.All, # Standard.ExtraWhiteSpaceRule) - txt = Text(txt).apply(*ReinsertEllipsisRules.All) - if re.search(Common.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt): + txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All) + if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt): txt = re.split( - Common.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt) + self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt) return txt else: txt = txt.replace('\n', '') @@ -141,7 +137,7 @@ def paren_replace(match): sub1 = re.sub(r'\s(?=\()', '\r', match) sub2 = re.sub(r'(?<=\))\s', '\r', sub1) return sub2 - self.text = re.sub(Common.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX, + self.text = 
re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX, paren_replace, self.text) def replace_continuous_punctuation(self): @@ -150,12 +146,12 @@ def continuous_puncs_replace(match): sub1 = re.sub(re.escape('!'), '&ᓴ&', match) sub2 = re.sub(re.escape('?'), '&ᓷ&', sub1) return sub2 - self.text = re.sub(Common.CONTINUOUS_PUNCTUATION_REGEX, + self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX, continuous_puncs_replace, self.text) def replace_periods_before_numeric_references(self): - # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703 - self.text = re.sub(Common.NUMBERED_REFERENCE_REGEX, + # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703 + self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX, r"∯\2\r\7", self.text) def consecutive_underscore(self, txt): @@ -164,70 +160,59 @@ def consecutive_underscore(self, txt): return len(txt) == 0 def check_for_punctuation(self, txt): - if any(p in txt for p in Standard.Punctuations): + if any(p in txt for p in self.lang.Punctuations): sents = self.process_text(txt) return sents else: # NOTE: next steps of check_for_punctuation will unpack this list - return TextSpan(txt, 0, len(txt)) - # return [txt] + return [txt] def process_text(self, txt): - if txt[-1] not in Standard.Punctuations: + if txt[-1] not in self.lang.Punctuations: txt += 'ȸ' txt = ExclamationWords.apply_rules(txt) - txt = BetweenPunctuation(txt).replace() + txt = self.between_punctuation(txt) # handle text having only doublepunctuations - if not re.match(DoublePunctuationRules.DoublePunctuation, txt): - txt = Text(txt).apply(*DoublePunctuationRules.All) - txt = Text(txt).apply(Standard.QuestionMarkInQuotationRule, - *ExclamationPointRules.All) + if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation, txt): + txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All) + txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule, + 
*self.lang.ExclamationPointRules.All) txt = ListItemReplacer(txt).replace_parens() txt = self.sentence_boundary_punctuation(txt) return txt def replace_numbers(self): - self.text = Text(self.text).apply(*Numbers.All) + self.text = Text(self.text).apply(*self.lang.Numbers.All) - def abbreviations_replacer(self, txt): - # AbbreviationReplacer - raise NotImplementedError + def abbreviations_replacer(self): + if hasattr(self.lang, "AbbreviationReplacer"): + return self.lang.AbbreviationReplacer(self.text, self.lang) + else: + return AbbreviationReplacer(self.text, self.lang) - def replace_abbreviations(self, txt): - # abbreviations_replacer - raise NotImplementedError + def replace_abbreviations(self): + self.text = self.abbreviations_replacer().replace() def between_punctuation_processor(self, txt): - # BetweenPunctuation - raise NotImplementedError + if hasattr(self.lang, "BetweenPunctuation"): + return self.lang.BetweenPunctuation(txt) + else: + return BetweenPunctuation(txt) def between_punctuation(self, txt): - # between_punctuation_processor - raise NotImplementedError + txt = self.between_punctuation_processor(txt).replace() + return txt def sentence_boundary_punctuation(self, txt): - if hasattr(self.language_module, 'ReplaceColonBetweenNumbersRule'): + if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'): txt = Text(txt).apply( - self.language_module.ReplaceColonBetweenNumbersRule) - if hasattr(self.language_module, 'ReplaceNonSentenceBoundaryCommaRule'): + self.lang.ReplaceColonBetweenNumbersRule) + if hasattr(self.lang, 'ReplaceNonSentenceBoundaryCommaRule'): txt = Text(txt).apply( - self.language_module.ReplaceNonSentenceBoundaryCommaRule) + self.lang.ReplaceNonSentenceBoundaryCommaRule) # retain exclamation mark if it is an ending character of a given text txt = re.sub(r'&ᓴ&$', '!', txt) txt = [ - TextSpan(m.group(), m.start(), m.end()) - for m in re.finditer(Common.SENTENCE_BOUNDARY_REGEX, txt) + m.group() for m in 
re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt) ] return txt - - -if __name__ == "__main__": - text = "Header 1.2; Attachment Z\n\n\td. Compliance Log – Volume 12 \n\tAttachment A\n\n\te. Additional Logistics Data\n\tSection 10" - print("Input String:\n{}".format(text)) - p = Processor(text) - processed_op = p.process() - print("\nProcessed String:\n") - print("Number of sentences: {}\n".format(len(processed_op))) - print(processed_op) - for e in processed_op: - print(e) diff --git a/pysbd/segmenter.py b/pysbd/segmenter.py index c07305d..42a2f6d 100644 --- a/pysbd/segmenter.py +++ b/pysbd/segmenter.py @@ -37,20 +37,8 @@ def segment(self, text): raise ValueError("char_span must be False if clean is True. " "Since `clean=True` will modify original text.") elif self.clean: - text = Cleaner(text, doc_type=self.doc_type).clean() - processor = Processor(text, char_span=self.char_span) + text = Cleaner(text, self.language_module, doc_type=self.doc_type).clean() + processor = Processor(text, lang=self.language_module, char_span=self.char_span) segments = processor.process() return segments - -if __name__ == "__main__": - text = "My name is Jonas E. Smith. Please turn to p. 55." 
- print("Input String:\n{}".format(text)) - seg = Segmenter(language="en", clean=False, char_span=True) - segments = seg.segment(text) - print("\n################## Processing #######################\n") - print("Number of sentences: {}\n".format(len(segments))) - print("Sentences found:\n{}\n".format(segments)) - print("\n################## Output #######################\n") - for ind, sent in enumerate(segments, start=1): - print("{} -> {}".format(ind, sent)) diff --git a/pysbd/utils.py b/pysbd/utils.py index 9bbd440..c3cd3b8 100644 --- a/pysbd/utils.py +++ b/pysbd/utils.py @@ -10,7 +10,7 @@ def __init__(self, pattern, replacement): self.pattern = pattern self.replacement = replacement - def __repr__(self): + def __repr__(self): # pragma: no cover return '<{} pattern="{}" and replacement="{}">'.format( self.__class__.__name__, self.pattern, self.replacement) @@ -56,7 +56,7 @@ def __init__(self, sent, start, end): self.start = start self.end = end - def __repr__(self): + def __repr__(self): # pragma: no cover return "{0}(sent='{1}', start={2}, end={3})".format( self.__class__.__name__, self.sent, self.start, self.end) @@ -75,21 +75,9 @@ def __init__(self, nlp, language='en', clean=False, char_span=True): char_span=char_span) def __call__(self, doc): - sents_char_spans = self.seg.segment(doc.text) - char_spans = [doc.char_span(sent_span.start, sent_span.end) - for sent_span in sents_char_spans] - start_token_ids = [span[0].idx for span in char_spans if span - is not None] + sents_char_spans = self.seg.segment(doc.text_with_ws) + start_token_ids = [sent.start for sent in sents_char_spans] for token in doc: token.is_sent_start = (True if token.idx in start_token_ids else False) return doc - - -if __name__ == "__main__": - SubstituteListPeriodRule = Rule('♨', '∯') - StdRule = Rule(r'∯', r'∯♨') - more_rules = [Rule(r'∯♨', r'∯∯∯∯'), Rule(r'∯∯∯∯', '♨♨')] - sample_text = Text("I. 
abcd ♨ acnjfe") - output = sample_text.apply(SubstituteListPeriodRule, StdRule, *more_rules) - print(output) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..61e9f80 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +spacy>=2.2.4 diff --git a/setup.py b/setup.py index bb234c9..7bd7ef4 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,10 @@ root = os.path.abspath(os.path.dirname(__file__)) +REQUIRES_PYTHON = ">=3" +# What packages are required for this module to be executed? +REQUIRED = ["spacy"] + with io.open(os.path.join(root, "pysbd", "about.py"), encoding="utf8") as f: about = {} exec(f.read(), about) @@ -69,8 +73,8 @@ def run(self): author_email=about["__email__"], url=about["__uri__"], packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]), - install_requires=[], - python_requires='>=3', + install_requires=REQUIRED, + python_requires=REQUIRES_PYTHON, include_package_data=True, license=about["__license__"], classifiers=[ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..0b0a075 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,43 @@ +from pysbd import segmenter +import pytest +import pysbd + +@pytest.fixture() +def pysbd_default_en_no_clean_no_span_fixture(): + en_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False) + return en_segmenter + +@pytest.fixture() +def en_with_clean_no_span_fixture(): + en_segmenter = pysbd.Segmenter(language="en", clean=True, char_span=False) + return en_segmenter + +@pytest.fixture() +def en_no_clean_with_span_fixture(): + en_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True) + return en_segmenter + +@pytest.fixture() +def hi_default_fixture(): + hi_segmenter = pysbd.Segmenter(language="hi", clean=False, char_span=False) + return hi_segmenter + +@pytest.fixture() +def mr_default_fixture(): + mr_segmenter = pysbd.Segmenter(language="mr", clean=False, char_span=False) + return mr_segmenter + 
+@pytest.fixture() +def zh_default_fixture(): + zh_segmenter = pysbd.Segmenter(language="zh", clean=False, char_span=False) + return zh_segmenter + +@pytest.fixture() +def es_default_fixture(): + es_segmenter = pysbd.Segmenter(language="es", clean=False, char_span=False) + return es_segmenter + +@pytest.fixture() +def es_with_clean_no_span_fixture(): + es_segmenter_clean = pysbd.Segmenter(language="es", clean=True, char_span=False) + return es_segmenter_clean diff --git a/tests/lang/__init__.py b/tests/lang/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/lang/test_chinese.py b/tests/lang/test_chinese.py new file mode 100644 index 0000000..cddd552 --- /dev/null +++ b/tests/lang/test_chinese.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +import pytest + +GOLDEN_ZH_RULES_TEST_CASES = [ + ("安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", + ["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"]), + ("我们明天一起去看《摔跤吧!爸爸》好吗?好!", + ["我们明天一起去看《摔跤吧!爸爸》好吗?", "好!"]) +] + +@pytest.mark.parametrize('text,expected_sents', GOLDEN_ZH_RULES_TEST_CASES) +def test_zsh_sbd(zh_default_fixture, text, expected_sents): + """Chinese language SBD tests from Pragmatic Segmenter""" + segments = zh_default_fixture.segment(text) + assert segments == expected_sents diff --git a/tests/test_english.py b/tests/lang/test_english.py similarity index 96% rename from tests/test_english.py rename to tests/lang/test_english.py index 374ce65..1b46c96 100644 --- a/tests/test_english.py +++ b/tests/lang/test_english.py @@ -3,7 +3,7 @@ import pysbd -RULES_TEST_CASES = [ +GOLDEN_EN_RULES_TEST_CASES = [ ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]), ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]), ("There it is! 
I found it.", ["There it is!", "I found it."]), @@ -164,9 +164,8 @@ ] -@pytest.mark.parametrize('text,expected_sents', RULES_TEST_CASES) -def test_en_sbd(text, expected_sents): +@pytest.mark.parametrize('text,expected_sents', GOLDEN_EN_RULES_TEST_CASES) +def test_en_sbd(pysbd_default_en_no_clean_no_span_fixture, text, expected_sents): """SBD tests from Pragmatic Segmenter""" - seg = pysbd.Segmenter(language="en", clean=False) - segments = seg.segment(text) + segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) assert segments == expected_sents diff --git a/tests/test_english_clean.py b/tests/lang/test_english_clean.py similarity index 96% rename from tests/test_english_clean.py rename to tests/lang/test_english_clean.py index ef86e39..490cd0e 100644 --- a/tests/test_english_clean.py +++ b/tests/lang/test_english_clean.py @@ -776,21 +776,6 @@ ["DOWN THE RABBIT-HOLE", "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do.", "Once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, \"and what is the use of a book,\" thought Alice, \"without pictures or conversations?\"", "So she was considering in her own mind (as well as she could, for the day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.", "There was nothing so very remarkable in that, nor did Alice think it so very much out of the way to hear the Rabbit say to itself, \"Oh dear! Oh dear! 
I shall be too late!\"", "But when the Rabbit actually took a watch out of its waistcoat-pocket and looked at it and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and, burning with curiosity, she ran across the field after it and was just in time to see it pop down a large rabbit-hole, under the hedge.", "In another moment, down went Alice after it!", "The rabbit-hole went straight on like a tunnel for some way and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down what seemed to be a very deep well.", "Either the well was very deep, or she fell very slowly, for she had plenty of time, as she went down, to look about her.", "First, she tried to make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed.", "It was labeled \"ORANGE MARMALADE,\" but, to her great disappointment, it was empty; she did not like to drop the jar, so managed to put it into one of the cupboards as she fell past it.", "Down, down, down!", "Would the fall never come to an end?", "There was nothing else to do, so Alice soon began talking to herself.", "\"Dinah'll miss me very much to-night, I should think!\"", "(Dinah was the cat.)", "\"I hope they'll remember her saucer of milk at tea-time. Dinah, my dear, I wish you were down here with me!\"", "Alice felt that she was dozing off, when suddenly, thump! thump! 
down she came upon a heap of sticks and dry leaves, and the fall was over.", "Alice was not a bit hurt, and she jumped up in a moment.", "She looked up, but it was all dark overhead; before her was another long passage and the White Rabbit was still in sight, hurrying down it.", "There was not a moment to be lost.", "Away went Alice like the wind and was just in time to hear it say, as it turned a corner, \"Oh, my ears and whiskers, how late it's getting!\"", "She was close behind it when she turned the corner, but the Rabbit was no longer to be seen.", "She found herself in a long, low hall, which was lit up by a row of lamps hanging from the roof.", "There were doors all 'round the hall, but they were all locked; and when Alice had been all the way down one side and up the other, trying every door, she walked sadly down the middle, wondering how she was ever to get out again.", "Suddenly she came upon a little table, all made of solid glass.", "There was nothing on it but a tiny golden key, and Alice's first idea was that this might belong to one of the doors of the hall; but, alas! either the locks were too large, or the key was too small, but, at any rate, it would not open any of them.", "However, on the second time 'round, she came upon a low curtain she had not noticed before, and behind it was a little door about fifteen inches high.", "She tried the little golden key in the lock, and to her great delight, it fitted!", "Alice opened the door and found that it led into a small passage, not much larger than a rat-hole; she knelt down and looked along the passage into the loveliest garden you ever saw.", "How she longed to get out of that dark hall and wander about among those beds of bright flowers and those cool fountains, but she could not even get her head through the doorway.", "\"Oh,\" said Alice, \"how I wish I could shut up like a telescope! 
I think I could, if I only knew how to begin.\"", "Alice went back to the table, half hoping she might find another key on it, or at any rate, a book of rules for shutting people up like telescopes.", "This time she found a little bottle on it (\"which certainly was not here before,\" said Alice), and tied 'round the neck of the bottle was a paper label, with the words \"DRINK ME\" beautifully printed on it in large letters.", "\"No, I'll look first,\" she said, \"and see whether it's marked '_poison_' or not,\" for she had never forgotten that, if you drink from a bottle marked \"poison,\" it is almost certain to disagree with you, sooner or later.", "However, this bottle was _not_ marked \"poison,\" so Alice ventured to taste it, and, finding it very nice (it had a sort of mixed flavor of cherry-tart, custard, pineapple, roast turkey, toffy and hot buttered toast), she very soon finished it off.", "* * * * *", "\"What a curious feeling!\" said Alice.", "\"I must be shutting up like a telescope!\"", "And so it was indeed!", "She was now only ten inches high, and her face brightened up at the thought that she was now the right size for going through the little door into that lovely garden.", "After awhile, finding that nothing more happened, she decided on going into the garden at once; but, alas for poor Alice!", "When she got to the door, she found she had forgotten the little golden key, and when she went back to the table for it, she found she could not possibly reach it: she could see it quite plainly through the glass and she tried her best to climb up one of the legs of the table, but it was too slippery, and when she had tired herself out with trying, the poor little thing sat down and cried.", "\"Come, there's no use in crying like that!\" said Alice to herself rather sharply.", "\"I advise you to leave off this minute!\"", "She generally gave herself very good advice (though she very seldom followed it), and sometimes she scolded herself so severely as to 
bring tears into her eyes.", "Soon her eye fell on a little glass box that was lying under the table: she opened it and found in it a very small cake, on which the words \"EAT ME\" were beautifully marked in currants.", "\"Well, I'll eat it,\" said Alice, \"and if it makes me grow larger, I can reach the key; and if it makes me grow smaller, I can creep under the door: so either way I'll get into the garden, and I don't care which happens!\"", "She ate a little bit and said anxiously to herself, \"Which way? Which way?\" holding her hand on the top of her head to feel which way she was growing; and she was quite surprised to find that she remained the same size.", "So she set to work and very soon finished off the cake.", "II--THE POOL OF TEARS", "\"Curiouser and curiouser!\" cried Alice (she was so much surprised that for the moment she quite forgot how to speak good English).", "\"Now I'm opening out like the largest telescope that ever was! Good-by, feet! Oh, my poor little feet, I wonder who will put on your shoes and stockings for you now, dears? I shall be a great deal too far off to trouble myself about you.\"", "Just at this moment her head struck against the roof of the hall; in fact, she was now rather more than nine feet high, and she at once took up the little golden key and hurried off to the garden door.", "Poor Alice!", "It was as much as she could do, lying down on one side, to look through into the garden with one eye; but to get through was more hopeless than ever.", "She sat down and began to cry again.", "She went on shedding gallons of tears, until there was a large pool all 'round her and reaching half down the hall.", "After a time, she heard a little pattering of feet in the distance and she hastily dried her eyes to see what was coming.", "It was the White Rabbit returning, splendidly dressed, with a pair of white kid-gloves in one hand and a large fan in the other.", "He came trotting along in a great hurry, muttering to himself, \"Oh! 
the Duchess, the Duchess! Oh! _won't_ she be savage if I've kept her waiting!\"", "When the Rabbit came near her, Alice began, in a low, timid voice, \"If you please, sir--\"", "The Rabbit started violently, dropped the white kid-gloves and the fan and skurried away into the darkness as hard as he could go.", "Alice took up the fan and gloves and she kept fanning herself all the time she went on talking.", "\"Dear, dear! How queer everything is to-day! And yesterday things went on just as usual. _Was_ I the same when I got up this morning? But if I'm not the same, the next question is, 'Who in the world am I?' Ah, _that's_ the great puzzle!\"", "As she said this, she looked down at her hands and was surprised to see that she had put on one of the Rabbit's little white kid-gloves while she was talking.", "\"How _can_ I have done that?\" she thought.", "\"I must be growing small again.\"", "She got up and went to the table to measure herself by it and found that she was now about two feet high and was going on shrinking rapidly.", "She soon found out that the cause of this was the fan she was holding and she dropped it hastily, just in time to save herself from shrinking away altogether.", "\"That _was_ a narrow escape!\" said Alice, a good deal frightened at the sudden change, but very glad to find herself still in existence.", "\"And now for the garden!\"", "And she ran with all speed back to the little door; but, alas! the little door was shut again and the little golden key was lying on the glass table as before.", "\"Things are worse than ever,\" thought the poor child, \"for I never was so small as this before, never!\"", "As she said these words, her foot slipped, and in another moment, splash! 
she was up to her chin in salt-water.", "Her first idea was that she had somehow fallen into the sea.", "However, she soon made out that she was in the pool of tears which she had wept when she was nine feet high.", "Just then she heard something splashing about in the pool a little way off, and she swam nearer to see what it was: she soon made out that it was only a mouse that had slipped in like herself.", "\"Would it be of any use, now,\" thought Alice, \"to speak to this mouse? Everything is so out-of-the-way down here that I should think very likely it can talk; at any rate, there's no harm in trying.\"", "So she began, \"O Mouse, do you know the way out of this pool? I am very tired of swimming about here, O Mouse!\"", "The Mouse looked at her rather inquisitively and seemed to her to wink with one of its little eyes, but it said nothing.", "\"Perhaps it doesn't understand English,\" thought Alice.", "\"I dare say it's a French mouse, come over with William the Conqueror.\"", "So she began again: \"Où est ma chatte?\" which was the first sentence in her French lesson-book.", "The Mouse gave a sudden leap out of the water and seemed to quiver all over with fright.", "\"Oh, I beg your pardon!\" cried Alice hastily, afraid that she had hurt the poor animal's feelings.", "\"I quite forgot you didn't like cats.\"", "\"Not like cats!\" cried the Mouse in a shrill, passionate voice.", "\"Would _you_ like cats, if you were me?\"", "\"Well, perhaps not,\" said Alice in a soothing tone; \"don't be angry about it. And yet I wish I could show you our cat Dinah. I think you'd take a fancy to cats, if you could only see her. She is such a dear, quiet thing.\"", "The Mouse was bristling all over and she felt certain it must be really offended.", "\"We won't talk about her any more, if you'd rather not.\"", "\"We, indeed!\" cried the Mouse, who was trembling down to the end of its tail.", "\"As if _I_ would talk on such a subject! 
Our family always _hated_ cats--nasty, low, vulgar things! Don't let me hear the name again!\"", "\"I won't indeed!\" said Alice, in a great hurry to change the subject of conversation.", "\"Are you--are you fond--of--of dogs? There is such a nice little dog near our house, I should like to show you! It kills all the rats and--oh, dear!\" cried Alice in a sorrowful tone.", "\"I'm afraid I've offended it again!\"", "For the Mouse was swimming away from her as hard as it could go, and making quite a commotion in the pool as it went.", "So she called softly after it, \"Mouse dear! Do come back again, and we won't talk about cats, or dogs either, if you don't like them!\"", "When the Mouse heard this, it turned 'round and swam slowly back to her; its face was quite pale, and it said, in a low, trembling voice, \"Let us get to the shore and then I'll tell you my history and you'll understand why it is I hate cats and dogs.\"", "It was high time to go, for the pool was getting quite crowded with the birds and animals that had fallen into it; there were a Duck and a Dodo, a Lory and an Eaglet, and several other curious creatures.", "Alice led the way and the whole party swam to the shore.", "III--A CAUCUS-RACE AND A LONG TALE", "They were indeed a queer-looking party that assembled on the bank--the birds with draggled feathers, the animals with their fur clinging close to them, and all dripping wet, cross and uncomfortable.", "The first question, of course, was how to get dry again.", "They had a consultation about this and after a few minutes, it seemed quite natural to Alice to find herself talking familiarly with them, as if she had known them all her life.", "At last the Mouse, who seemed to be a person of some authority among them, called out, \"Sit down, all of you, and listen to me! 
_I'll_ soon make you dry enough!\"", "They all sat down at once, in a large ring, with the Mouse in the middle.", "\"Ahem!\" said the Mouse with an important air.", "\"Are you all ready? This is the driest thing I know. Silence all 'round, if you please! 'William the Conqueror, whose cause was favored by the pope, was soon submitted to by the English, who wanted leaders, and had been of late much accustomed to usurpation and conquest. Edwin and Morcar, the Earls of Mercia and Northumbria'--\"", "\"Ugh!\" said the Lory, with a shiver.", "\"--'And even Stigand, the patriotic archbishop of Canterbury, found it advisable'--\"", "\"Found _what_?\" said the Duck.", "\"Found _it_,\" the Mouse replied rather crossly; \"of course, you know what 'it' means.\"", "\"I know what 'it' means well enough, when _I_ find a thing,\" said the Duck; \"it's generally a frog or a worm. The question is, what did the archbishop find?\"", "The Mouse did not notice this question, but hurriedly went on, \"'--found it advisable to go with Edgar Atheling to meet William and offer him the crown.'--How are you getting on now, my dear?\" it continued, turning to Alice as it spoke.", "\"As wet as ever,\" said Alice in a melancholy tone; \"it doesn't seem to dry me at all.\"", "\"In that case,\" said the Dodo solemnly, rising to its feet, \"I move that the meeting adjourn, for the immediate adoption of more energetic remedies--\"", "\"Speak English!\" said the Eaglet.", "\"I don't know the meaning of half those long words, and, what's more, I don't believe you do either!\"", "\"What I was going to say,\" said the Dodo in an offended tone, \"is that the best thing to get us dry would be a Caucus-race.\"", "\"What _is_ a Caucus-race?\" said Alice.", "\"Why,\" said the Dodo, \"the best way to explain it is to do it.\"", "First it marked out a race-course, in a sort of circle, and then all the party were placed along the course, here and there.", "There was no \"One, two, three and away!\" but they 
began running when they liked and left off when they liked, so that it was not easy to know when the race was over.", "However, when they had been running half an hour or so and were quite dry again, the Dodo suddenly called out, \"The race is over!\" and they all crowded 'round it, panting and asking, \"But who has won?\"", "This question the Dodo could not answer without a great deal of thought.", "At last it said, \"_Everybody_ has won, and _all_ must have prizes.\"", "\"But who is to give the prizes?\" quite a chorus of voices asked.", "\"Why, _she_, of course,\" said the Dodo, pointing to Alice with one finger; and the whole party at once crowded 'round her, calling out, in a confused way, \"Prizes! Prizes!\"", "Alice had no idea what to do, and in despair she put her hand into her pocket and pulled out a box of comfits (luckily the salt-water had not got into it) and handed them 'round as prizes.", "There was exactly one a-piece, all 'round.", "The next thing was to eat the comfits; this caused some noise and confusion, as the large birds complained that they could not taste theirs, and the small ones choked and had to be patted on the back.", "However, it was over at last and they sat down again in a ring and begged the Mouse to tell them something more.", "\"You promised to tell me your history, you know,\" said Alice, \"and why it is you hate--C and D,\" she added in a whisper, half afraid that it would be offended again.", "\"Mine is a long and a sad tale!\" said the Mouse, turning to Alice and sighing.", "\"It _is_ a long tail, certainly,\" said Alice, looking down with wonder at the Mouse's tail, \"but why do you call it sad?\"", "And she kept on puzzling about it while the Mouse was speaking, so that her idea of the tale was something like this:--", "\"You are not attending!\" said the Mouse to Alice, severely.", "\"What are you thinking of?\"", "\"I beg your pardon,\" said Alice very humbly, \"you had got to the fifth bend, I think?\"", "\"You insult 
me by talking such nonsense!\" said the Mouse, getting up and walking away.", "\"Please come back and finish your story!\"", "Alice called after it.", "And the others all joined in chorus, \"Yes, please do!\"", "But the Mouse only shook its head impatiently and walked a little quicker.", "\"I wish I had Dinah, our cat, here!\" said Alice.", "This caused a remarkable sensation among the party.", "Some of the birds hurried off at once, and a Canary called out in a trembling voice, to its children, \"Come away, my dears! It's high time you were all in bed!\"", "On various pretexts they all moved off and Alice was soon left alone.", "\"I wish I hadn't mentioned Dinah! Nobody seems to like her down here and I'm sure she's the best cat in the world!\"", "Poor Alice began to cry again, for she felt very lonely and low-spirited.", "In a little while, however, she again heard a little pattering of footsteps in the distance and she looked up eagerly.", "IV--THE RABBIT SENDS IN A LITTLE BILL", "It was the White Rabbit, trotting slowly back again and looking anxiously about as it went, as if it had lost something; Alice heard it muttering to itself, \"The Duchess! The Duchess! Oh, my dear paws! Oh, my fur and whiskers! She'll get me executed, as sure as ferrets are ferrets! Where _can_ I have dropped them, I wonder?\"", "Alice guessed in a moment that it was looking for the fan and the pair of white kid-gloves and she very good-naturedly began hunting about for them, but they were nowhere to be seen--everything seemed to have changed since her swim in the pool, and the great hall, with the glass table and the little door, had vanished completely.", "Very soon the Rabbit noticed Alice, and called to her, in an angry tone, \"Why, Mary Ann, what _are_ you doing out here? Run home this moment and fetch me a pair of gloves and a fan! 
Quick, now!\"", "\"He took me for his housemaid!\" said Alice, as she ran off.", "\"How surprised he'll be when he finds out who I am!\"", "As she said this, she came upon a neat little house, on the door of which was a bright brass plate with the name \"W. RABBIT\" engraved upon it.", "She went in without knocking and hurried upstairs, in great fear lest she should meet the real Mary Ann and be turned out of the house before she had found the fan and gloves.", "By this time, Alice had found her way into a tidy little room with a table in the window, and on it a fan and two or three pairs of tiny white kid-gloves; she took up the fan and a pair of the gloves and was just going to leave the room, when her eyes fell upon a little bottle that stood near the looking-glass.", "She uncorked it and put it to her lips, saying to herself, \"I do hope it'll make me grow large again, for, really, I'm quite tired of being such a tiny little thing!\"", "Before she had drunk half the bottle, she found her head pressing against the ceiling, and had to stoop to save her neck from being broken.", "She hastily put down the bottle, remarking, \"That's quite enough--I hope I sha'n't grow any more.\"", "Alas!", "It was too late to wish that!", "She went on growing and growing and very soon she had to kneel down on the floor.", "Still she went on growing, and, as a last resource, she put one arm out of the window and one foot up the chimney, and said to herself, \"Now I can do no more, whatever happens. What _will_ become of me?\"", "Luckily for Alice, the little magic bottle had now had its full effect and she grew no larger.", "After a few minutes she heard a voice outside and stopped to listen.", "\"Mary Ann! 
Mary Ann!\" said the voice.", "\"Fetch me my gloves this moment!\"", "Then came a little pattering of feet on the stairs.", "Alice knew it was the Rabbit coming to look for her and she trembled till she shook the house, quite forgetting that she was now about a thousand times as large as the Rabbit and had no reason to be afraid of it.", "Presently the Rabbit came up to the door and tried to open it; but as the door opened inwards and Alice's elbow was pressed hard against it, that attempt proved a failure.", "Alice heard it say to itself, \"Then I'll go 'round and get in at the window.\"", "\"_That_ you won't!\" thought Alice; and after waiting till she fancied she heard the Rabbit just under the window, she suddenly spread out her hand and made a snatch in the air.", "She did not get hold of anything, but she heard a little shriek and a fall and a crash of broken glass, from which she concluded that it was just possible it had fallen into a cucumber-frame or something of that sort.", "Next came an angry voice--the Rabbit's--\"Pat! Pat! Where are you?\"", "And then a voice she had never heard before, \"Sure then, I'm here! Digging for apples, yer honor!\"", "\"Here! Come and help me out of this! Now tell me, Pat, what's that in the window?\"", "\"Sure, it's an arm, yer honor!\"", "\"Well, it's got no business there, at any rate; go and take it away!\"", "There was a long silence after this and Alice could only hear whispers now and then, and at last she spread out her hand again and made another snatch in the air.", "This time there were _two_ little shrieks and more sounds of broken glass.", "\"I wonder what they'll do next!\" thought Alice.", "\"As for pulling me out of the window, I only wish they _could_!\"", "She waited for some time without hearing anything more.", "At last came a rumbling of little cart-wheels and the sound of a good many voices all talking together.", "She made out the words: \"Where's the other ladder? Bill's got the other--Bill! 
Here, Bill! Will the roof bear?--Who's to go down the chimney?--Nay, _I_ sha'n't! _You_ do it! Here, Bill! The master says you've got to go down the chimney!\"", "Alice drew her foot as far down the chimney as she could and waited till she heard a little animal scratching and scrambling about in the chimney close above her; then she gave one sharp kick and waited to see what would happen next.", "The first thing she heard was a general chorus of \"There goes Bill!\" then the Rabbit's voice alone--\"Catch him, you by the hedge!\"", "Then silence and then another confusion of voices--\"Hold up his head--Brandy now--Don't choke him--What happened to you?\"", "Last came a little feeble, squeaking voice, \"Well, I hardly know--No more, thank ye. I'm better now--all I know is, something comes at me like a Jack-in-the-box and up I goes like a sky-rocket!\"", "After a minute or two of silence, they began moving about again, and Alice heard the Rabbit say, \"A barrowful will do, to begin with.\"", "\"A barrowful of _what_?\" thought Alice.", "But she had not long to doubt, for the next moment a shower of little pebbles came rattling in at the window and some of them hit her in the face.", "Alice noticed, with some surprise, that the pebbles were all turning into little cakes as they lay on the floor and a bright idea came into her head.", "\"If I eat one of these cakes,\" she thought, \"it's sure to make _some_ change in my size.\"", "So she swallowed one of the cakes and was delighted to find that she began shrinking directly.", "As soon as she was small enough to get through the door, she ran out of the house and found quite a crowd of little animals and birds waiting outside.", "They all made a rush at Alice the moment she appeared, but she ran off as hard as she could and soon found herself safe in a thick wood.", "\"The first thing I've got to do,\" said Alice to herself, as she wandered about in the wood, \"is to grow to my right size again; and the second thing is to 
find my way into that lovely garden. I suppose I ought to eat or drink something or other, but the great question is 'What?'\"", "Alice looked all around her at the flowers and the blades of grass, but she could not see anything that looked like the right thing to eat or drink under the circumstances.", "There was a large mushroom growing near her, about the same height as herself.", "She stretched herself up on tiptoe and peeped over the edge and her eyes immediately met those of a large blue caterpillar, that was sitting on the top, with its arms folded, quietly smoking a long hookah and taking not the smallest notice of her or of anything else.", "V--ADVICE FROM A CATERPILLAR", "At last the Caterpillar took the hookah out of its mouth and addressed Alice in a languid, sleepy voice.", "\"Who are _you_?\" said the Caterpillar.", "Alice replied, rather shyly, \"I--I hardly know, sir, just at present--at least I know who I _was_ when I got up this morning, but I think I must have changed several times since then.\"", "\"What do you mean by that?\" said the Caterpillar, sternly.", "\"Explain yourself!\"", "\"I can't explain _myself_, I'm afraid, sir,\" said Alice, \"because I'm not myself, you see--being so many different sizes in a day is very confusing.\"", "She drew herself up and said very gravely, \"I think you ought to tell me who _you_ are, first.\"", "\"Why?\" said the Caterpillar.", "As Alice could not think of any good reason and the Caterpillar seemed to be in a _very_ unpleasant state of mind, she turned away.", "\"Come back!\" the Caterpillar called after her.", "\"I've something important to say!\"", "Alice turned and came back again.", "\"Keep your temper,\" said the Caterpillar.", "\"Is that all?\" said Alice, swallowing down her anger as well as she could.", "\"No,\" said the Caterpillar.", "It unfolded its arms, took the hookah out of its mouth again, and said, \"So you think you're changed, do you?\"", "\"I'm afraid, I am, sir,\" said Alice.", "\"I 
can't remember things as I used--and I don't keep the same size for ten minutes together!\"", "\"What size do you want to be?\" asked the Caterpillar.", "\"Oh, I'm not particular as to size,\" Alice hastily replied, \"only one doesn't like changing so often, you know. I should like to be a _little_ larger, sir, if you wouldn't mind,\" said Alice.", "\"Three inches is such a wretched height to be.\"", "\"It is a very good height indeed!\" said the Caterpillar angrily, rearing itself upright as it spoke (it was exactly three inches high).", "In a minute or two, the Caterpillar got down off the mushroom and crawled away into the grass, merely remarking, as it went, \"One side will make you grow taller, and the other side will make you grow shorter.\"", "\"One side of _what_? The other side of _what_?\" thought Alice to herself.", "\"Of the mushroom,\" said the Caterpillar, just as if she had asked it aloud; and in another moment, it was out of sight.", "Alice remained looking thoughtfully at the mushroom for a minute, trying to make out which were the two sides of it.", "At last she stretched her arms 'round it as far as they would go, and broke off a bit of the edge with each hand.", "\"And now which is which?\" she said to herself, and nibbled a little of the right-hand bit to try the effect.", "The next moment she felt a violent blow underneath her chin--it had struck her foot!", "She was a good deal frightened by this very sudden change, as she was shrinking rapidly; so she set to work at once to eat some of the other bit.", "Her chin was pressed so closely against her foot that there was hardly room to open her mouth; but she did it at last and managed to swallow a morsel of the left-hand bit....", "\"Come, my head's free at last!\" said Alice; but all she could see, when she looked down, was an immense length of neck, which seemed to rise like a stalk out of a sea of green leaves that lay far below her.", "\"Where _have_ my shoulders got to? 
And oh, my poor hands, how is it I can't see you?\"", "She was delighted to find that her neck would bend about easily in any direction, like a serpent.", "She had just succeeded in curving it down into a graceful zigzag and was going to dive in among the leaves, when a sharp hiss made her draw back in a hurry--a large pigeon had flown into her face and was beating her violently with its wings.", "\"Serpent!\" cried the Pigeon.", "\"I'm _not_ a serpent!\" said Alice indignantly.", "\"Let me alone!\"", "\"I've tried the roots of trees, and I've tried banks, and I've tried hedges,\" the Pigeon went on, \"but those serpents! There's no pleasing them!\"", "Alice was more and more puzzled.", "\"As if it wasn't trouble enough hatching the eggs,\" said the Pigeon, \"but I must be on the look-out for serpents, night and day! And just as I'd taken the highest tree in the wood,\" continued the Pigeon, raising its voice to a shriek, \"and just as I was thinking I should be free of them at last, they must needs come wriggling down from the sky! 
Ugh, Serpent!\"", "\"But I'm _not_ a serpent, I tell you!\" said Alice.", "\"I'm a--I'm a--I'm a little girl,\" she added rather doubtfully, as she remembered the number of changes she had gone through that day.", "\"You're looking for eggs, I know _that_ well enough,\" said the Pigeon; \"and what does it matter to me whether you're a little girl or a serpent?\"", "\"It matters a good deal to _me_,\" said Alice hastily; \"but I'm not looking for eggs, as it happens, and if I was, I shouldn't want _yours_--I don't like them raw.\"", "\"Well, be off, then!\" said the Pigeon in a sulky tone, as it settled down again into its nest.", "Alice crouched down among the trees as well as she could, for her neck kept getting entangled among the branches, and every now and then she had to stop and untwist it.", "After awhile she remembered that she still held the pieces of mushroom in her hands, and she set to work very carefully, nibbling first at one and then at the other, and growing sometimes taller and sometimes shorter, until she had succeeded in bringing herself down to her usual height.", "It was so long since she had been anything near the right size that it felt quite strange at first.", "\"The next thing is to get into that beautiful garden--how _is_ that to be done, I wonder?\"", "As she said this, she came suddenly upon an open place, with a little house in it about four feet high.", "\"Whoever lives there,\" thought Alice, \"it'll never do to come upon them _this_ size; why, I should frighten them out of their wits!\"", "She did not venture to go near the house till she had brought herself down to nine inches high.", "VI--PIG AND PEPPER", "For a minute or two she stood looking at the house, when suddenly a footman in livery came running out of the wood (judging by his face only, she would have called him a fish)--and rapped loudly at the door with his knuckles.", "It was opened by another footman in livery, with a round face and large eyes like a frog.", "The 
Fish-Footman began by producing from under his arm a great letter, and this he handed over to the other, saying, in a solemn tone, \"For the Duchess. An invitation from the Queen to play croquet.\"", "The Frog-Footman repeated, in the same solemn tone, \"From the Queen. An invitation for the Duchess to play croquet.\"", "Then they both bowed low and their curls got entangled together.", "When Alice next peeped out, the Fish-Footman was gone, and the other was sitting on the ground near the door, staring stupidly up into the sky.", "Alice went timidly up to the door and knocked.", "\"There's no sort of use in knocking,\" said the Footman, \"and that for two reasons. First, because I'm on the same side of the door as you are; secondly, because they're making such a noise inside, no one could possibly hear you.\"", "And certainly there _was_ a most extraordinary noise going on within--a constant howling and sneezing, and every now and then a great crash, as if a dish or kettle had been broken to pieces.", "\"How am I to get in?\" asked Alice.", "\"_Are_ you to get in at all?\" said the Footman.", "\"That's the first question, you know.\"", "Alice opened the door and went in.", "The door led right into a large kitchen, which was full of smoke from one end to the other; the Duchess was sitting on a three-legged stool in the middle, nursing a baby; the cook was leaning over the fire, stirring a large caldron which seemed to be full of soup.", "\"There's certainly too much pepper in that soup!\"", "Alice said to herself, as well as she could for sneezing.", "Even the Duchess sneezed occasionally; and as for the baby, it was sneezing and howling alternately without a moment's pause.", "The only two creatures in the kitchen that did _not_ sneeze were the cook and a large cat, which was grinning from ear to ear.", "\"Please would you tell me,\" said Alice, a little timidly, \"why your cat grins like that?\"", "\"It's a Cheshire-Cat,\" said the Duchess, \"and that's why.\"", 
"\"I didn't know that Cheshire-Cats always grinned; in fact, I didn't know that cats _could_ grin,\" said Alice.", "\"You don't know much,\" said the Duchess, \"and that's a fact.\"", "Just then the cook took the caldron of soup off the fire, and at once set to work throwing everything within her reach at the Duchess and the baby--the fire-irons came first; then followed a shower of saucepans, plates and dishes.", "The Duchess took no notice of them, even when they hit her, and the baby was howling so much already that it was quite impossible to say whether the blows hurt it or not.", "\"Oh, _please_ mind what you're doing!\" cried Alice, jumping up and down in an agony of terror.", "\"Here! You may nurse it a bit, if you like!\" the Duchess said to Alice, flinging the baby at her as she spoke.", "\"I must go and get ready to play croquet with the Queen,\" and she hurried out of the room.", "Alice caught the baby with some difficulty, as it was a queer-shaped little creature and held out its arms and legs in all directions.", "\"If I don't take this child away with me,\" thought Alice, \"they're sure to kill it in a day or two. Wouldn't it be murder to leave it behind?\"", "She said the last words out loud and the little thing grunted in reply.", "\"If you're going to turn into a pig, my dear,\" said Alice, \"I'll have nothing more to do with you. 
Mind now!\"", "Alice was just beginning to think to herself, \"Now, what am I to do with this creature, when I get it home?\" when it grunted again so violently that Alice looked down into its face in some alarm.", "This time there could be _no_ mistake about it--it was neither more nor less than a pig; so she set the little creature down and felt quite relieved to see it trot away quietly into the wood.", "Alice was a little startled by seeing the Cheshire-Cat sitting on a bough of a tree a few yards off.", "The Cat only grinned when it saw her.", "\"Cheshire-Puss,\" began Alice, rather timidly, \"would you please tell me which way I ought to go from here?\"", "\"In _that_ direction,\" the Cat said, waving the right paw 'round, \"lives a Hatter; and in _that_ direction,\" waving the other paw, \"lives a March Hare. Visit either you like; they're both mad.\"", "\"But I don't want to go among mad people,\" Alice remarked.", "\"Oh, you can't help that,\" said the Cat; \"we're all mad here. Do you play croquet with the Queen to-day?\"", "\"I should like it very much,\" said Alice, \"but I haven't been invited yet.\"", "\"You'll see me there,\" said the Cat, and vanished.", "Alice had not gone much farther before she came in sight of the house of the March Hare; it was so large a house that she did not like to go near till she had nibbled some more of the left-hand bit of mushroom.", "VII--A MAD TEA-PARTY", "There was a table set out under a tree in front of the house, and the March Hare and the Hatter were having tea at it; a Dormouse was sitting between them, fast asleep.", "The table was a large one, but the three were all crowded together at one corner of it.", "\"No room! 
No room!\" they cried out when they saw Alice coming.", "\"There's _plenty_ of room!\" said Alice indignantly, and she sat down in a large arm-chair at one end of the table.", "The Hatter opened his eyes very wide on hearing this, but all he said was \"Why is a raven like a writing-desk?\"", "\"I'm glad they've begun asking riddles--I believe I can guess that,\" she added aloud.", "\"Do you mean that you think you can find out the answer to it?\" said the March Hare.", "\"Exactly so,\" said Alice.", "\"Then you should say what you mean,\" the March Hare went on.", "\"I do,\" Alice hastily replied; \"at least--at least I mean what I say--that's the same thing, you know.\"", "\"You might just as well say,\" added the Dormouse, which seemed to be talking in its sleep, \"that 'I breathe when I sleep' is the same thing as 'I sleep when I breathe!'\"", "\"It _is_ the same thing with you,\" said the Hatter, and he poured a little hot tea upon its nose.", "The Dormouse shook its head impatiently and said, without opening its eyes, \"Of course, of course; just what I was going to remark myself.\"", "\"Have you guessed the riddle yet?\" the Hatter said, turning to Alice again.", "\"No, I give it up,\" Alice replied.", "\"What's the answer?\"", "\"I haven't the slightest idea,\" said the Hatter.", "\"Nor I,\" said the March Hare.", "Alice gave a weary sigh.", "\"I think you might do something better with the time,\" she said, \"than wasting it in asking riddles that have no answers.\"", "\"Take some more tea,\" the March Hare said to Alice, very earnestly.", "\"I've had nothing yet,\" Alice replied in an offended tone, \"so I can't take more.\"", "\"You mean you can't take _less_,\" said the Hatter; \"it's very easy to take _more_ than nothing.\"", "At this, Alice got up and walked off.", "The Dormouse fell asleep instantly and neither of the others took the least notice of her going, though she looked back once or twice; the last time she saw them, they were trying to put 
the Dormouse into the tea-pot.", "\"At any rate, I'll never go _there_ again!\" said Alice, as she picked her way through the wood.", "\"It's the stupidest tea-party I ever was at in all my life!\"", "Just as she said this, she noticed that one of the trees had a door leading right into it.", "\"That's very curious!\" she thought.", "\"I think I may as well go in at once.\"", "And in she went.", "Once more she found herself in the long hall and close to the little glass table.", "Taking the little golden key, she unlocked the door that led into the garden.", "Then she set to work nibbling at the mushroom (she had kept a piece of it in her pocket) till she was about a foot high; then she walked down the little passage; and _then_--she found herself at last in the beautiful garden, among the bright flower-beds and the cool fountains.", "VIII--THE QUEEN'S CROQUET GROUND", "A large rose-tree stood near the entrance of the garden; the roses growing on it were white, but there were three gardeners at it, busily painting them red.", "Suddenly their eyes chanced to fall upon Alice, as she stood watching them.", "\"Would you tell me, please,\" said Alice, a little timidly, \"why you are painting those roses?\"", "Five and Seven said nothing, but looked at Two.", "Two began, in a low voice, \"Why, the fact is, you see, Miss, this here ought to have been a _red_ rose-tree, and we put a white one in by mistake; and, if the Queen was to find it out, we should all have our heads cut off, you know. So you see, Miss, we're doing our best, afore she comes, to--\"", "At this moment, Five, who had been anxiously looking across the garden, called out, \"The Queen! 
The Queen!\" and the three gardeners instantly threw themselves flat upon their faces.", "There was a sound of many footsteps and Alice looked 'round, eager to see the Queen.", "First came ten soldiers carrying clubs, with their hands and feet at the corners: next the ten courtiers; these were ornamented all over with diamonds.", "After these came the royal children; there were ten of them, all ornamented with hearts.", "Next came the guests, mostly Kings and Queens, and among them Alice recognized the White Rabbit.", "Then followed the Knave of Hearts, carrying the King's crown on a crimson velvet cushion; and last of all this grand procession came THE KING AND THE QUEEN OF HEARTS.", "When the procession came opposite to Alice, they all stopped and looked at her, and the Queen said severely, \"Who is this?\"", "She said it to the Knave of Hearts, who only bowed and smiled in reply.", "\"My name is Alice, so please Your Majesty,\" said Alice very politely; but she added to herself, \"Why, they're only a pack of cards, after all!\"", "\"Can you play croquet?\" shouted the Queen.", "The question was evidently meant for Alice.", "\"Yes!\" said Alice loudly.", "\"Come on, then!\" roared the Queen.", "\"It's--it's a very fine day!\" said a timid voice to Alice.", "She was walking by the White Rabbit, who was peeping anxiously into her face.", "\"Very,\" said Alice.", "\"Where's the Duchess?\"", "\"Hush! 
Hush!\" said the Rabbit.", "\"She's under sentence of execution.\"", "\"What for?\" said Alice.", "\"She boxed the Queen's ears--\" the Rabbit began.", "\"Get to your places!\" shouted the Queen in a voice of thunder, and people began running about in all directions, tumbling up against each other.", "However, they got settled down in a minute or two, and the game began.", "Alice thought she had never seen such a curious croquet-ground in her life; it was all ridges and furrows.", "The croquet balls were live hedgehogs, and the mallets live flamingos and the soldiers had to double themselves up and stand on their hands and feet, to make the arches.", "The players all played at once, without waiting for turns, quarrelling all the while and fighting for the hedgehogs; and in a very short time, the Queen was in a furious passion and went stamping about and shouting, \"Off with his head!\" or \"Off with her head!\" about once in a minute.", "\"They're dreadfully fond of beheading people here,\" thought Alice; \"the great wonder is that there's anyone left alive!\"", "She was looking about for some way of escape, when she noticed a curious appearance in the air.", "\"It's the Cheshire-Cat,\" she said to herself; \"now I shall have somebody to talk to.\"", "\"How are you getting on?\" said the Cat.", "\"I don't think they play at all fairly,\" Alice said, in a rather complaining tone; \"and they all quarrel so dreadfully one can't hear oneself speak--and they don't seem to have any rules in particular.\"", "\"How do you like the Queen?\" said the Cat in a low voice.", "\"Not at all,\" said Alice.", "Alice thought she might as well go back and see how the game was going on.", "So she went off in search of her hedgehog.", "The hedgehog was engaged in a fight with another hedgehog, which seemed to Alice an excellent opportunity for croqueting one of them with the other; the only difficulty was that her flamingo was gone across to the other side of the garden, where Alice 
could see it trying, in a helpless sort of way, to fly up into a tree.", "She caught the flamingo and tucked it away under her arm, that it might not escape again.", "Just then Alice ran across the Duchess (who was now out of prison).", "She tucked her arm affectionately into Alice's and they walked off together.", "Alice was very glad to find her in such a pleasant temper.", "She was a little startled, however, when she heard the voice of the Duchess close to her ear.", "\"You're thinking about something, my dear, and that makes you forget to talk.\"", "\"The game's going on rather better now,\" Alice said, by way of keeping up the conversation a little.", "\"'Tis so,\" said the Duchess; \"and the moral of that is--'Oh, 'tis love, 'tis love that makes the world go 'round!'\"", "\"Somebody said,\" Alice whispered, \"that it's done by everybody minding his own business!\"", "\"Ah, well! It means much the same thing,\" said the Duchess, digging her sharp little chin into Alice's shoulder, as she added \"and the moral of _that_ is--'Take care of the sense and the sounds will take care of themselves.'\"", "To Alice's great surprise, the Duchess's arm that was linked into hers began to tremble.", "Alice looked up and there stood the Queen in front of them, with her arms folded, frowning like a thunderstorm!", "\"Now, I give you fair warning,\" shouted the Queen, stamping on the ground as she spoke, \"either you or your head must be off, and that in about half no time. 
Take your choice!\"", "The Duchess took her choice, and was gone in a moment.", "\"Let's go on with the game,\" the Queen said to Alice; and Alice was too much frightened to say a word, but slowly followed her back to the croquet-ground.", "All the time they were playing, the Queen never left off quarreling with the other players and shouting, \"Off with his head!\" or \"Off with her head!\"", "By the end of half an hour or so, all the players, except the King, the Queen and Alice, were in custody of the soldiers and under sentence of execution.", "Then the Queen left off, quite out of breath, and walked away with Alice.", "Alice heard the King say in a low voice to the company generally, \"You are all pardoned.\"", "Suddenly the cry \"The Trial's beginning!\" was heard in the distance, and Alice ran along with the others.", "IX--WHO STOLE THE TARTS?", "The King and Queen of Hearts were seated on their throne when they arrived, with a great crowd assembled about them--all sorts of little birds and beasts, as well as the whole pack of cards: the Knave was standing before them, in chains, with a soldier on each side to guard him; and near the King was the White Rabbit, with a trumpet in one hand and a scroll of parchment in the other.", "In the very middle of the court was a table, with a large dish of tarts upon it.", "\"I wish they'd get the trial done,\" Alice thought, \"and hand 'round the refreshments!\"", "The judge, by the way, was the King and he wore his crown over his great wig.", "\"That's the jury-box,\" thought Alice; \"and those twelve creatures (some were animals and some were birds) I suppose they are the jurors.\"", "Just then the White Rabbit cried out \"Silence in the court!\"", "\"Herald, read the accusation!\" said the King.", "On this, the White Rabbit blew three blasts on the trumpet, then unrolled the parchment-scroll and read as follows:", "\"Call the first witness,\" said the King; and the White Rabbit blew three blasts on the trumpet and 
called out, \"First witness!\"", "The first witness was the Hatter.", "He came in with a teacup in one hand and a piece of bread and butter in the other.", "\"You ought to have finished,\" said the King.", "\"When did you begin?\"", "The Hatter looked at the March Hare, who had followed him into the court, arm in arm with the Dormouse.", "\"Fourteenth of March, I _think_ it was,\" he said.", "\"Give your evidence,\" said the King, \"and don't be nervous, or I'll have you executed on the spot.\"", "This did not seem to encourage the witness at all; he kept shifting from one foot to the other, looking uneasily at the Queen, and, in his confusion, he bit a large piece out of his teacup instead of the bread and butter.", "Just at this moment Alice felt a very curious sensation--she was beginning to grow larger again.", "The miserable Hatter dropped his teacup and bread and butter and went down on one knee.", "\"I'm a poor man, Your Majesty,\" he began.", "\"You're a _very_ poor _speaker_,\" said the King.", "\"You may go,\" said the King, and the Hatter hurriedly left the court.", "\"Call the next witness!\" said the King.", "The next witness was the Duchess's cook.", "She carried the pepper-box in her hand and the people near the door began sneezing all at once.", "\"Give your evidence,\" said the King.", "\"Sha'n't,\" said the cook.", "The King looked anxiously at the White Rabbit, who said, in a low voice, \"Your Majesty must cross-examine _this_ witness.\"", "\"Well, if I must, I must,\" the King said.", "\"What are tarts made of?\"", "\"Pepper, mostly,\" said the cook.", "For some minutes the whole court was in confusion and by the time they had settled down again, the cook had disappeared.", "\"Never mind!\" said the King, \"call the next witness.\"", "Alice watched the White Rabbit as he fumbled over the list.", "Imagine her surprise when he read out, at the top of his shrill little voice, the name \"Alice!\"", "X--ALICE'S EVIDENCE", "\"Here!\" cried Alice.", 
"She jumped up in such a hurry that she tipped over the jury-box, upsetting all the jurymen on to the heads of the crowd below.", "\"Oh, I _beg_ your pardon!\" she exclaimed in a tone of great dismay.", "\"The trial cannot proceed,\" said the King, \"until all the jurymen are back in their proper places--_all_,\" he repeated with great emphasis, looking hard at Alice.", "\"What do you know about this business?\" the King said to Alice.", "\"Nothing whatever,\" said Alice.", "The King then read from his book: \"Rule forty-two. _All persons more than a mile high to leave the court_.\"", "\"_I'm_ not a mile high,\" said Alice.", "\"Nearly two miles high,\" said the Queen.", "\"Well, I sha'n't go, at any rate,\" said Alice.", "The King turned pale and shut his note-book hastily.", "\"Consider your verdict,\" he said to the jury, in a low, trembling voice.", "\"There's more evidence to come yet, please Your Majesty,\" said the White Rabbit, jumping up in a great hurry.", "\"This paper has just been picked up. It seems to be a letter written by the prisoner to--to somebody.\"", "He unfolded the paper as he spoke and added, \"It isn't a letter, after all; it's a set of verses.\"", "\"Please, Your Majesty,\" said the Knave, \"I didn't write it and they can't prove that I did; there's no name signed at the end.\"", "\"You _must_ have meant some mischief, or else you'd have signed your name like an honest man,\" said the King.", "There was a general clapping of hands at this.", "\"Read them,\" he added, turning to the White Rabbit.", "There was dead silence in the court whilst the White Rabbit read out the verses.", "\"That's the most important piece of evidence we've heard yet,\" said the King.", "\"_I_ don't believe there's an atom of meaning in it,\" ventured Alice.", "\"If there's no meaning in it,\" said the King, \"that saves a world of trouble, you know, as we needn't try to find any. 
Let the jury consider their verdict.\"", "\"No, no!\" said the Queen.", "\"Sentence first--verdict afterwards.\"", "\"Stuff and nonsense!\" said Alice loudly.", "\"The idea of having the sentence first!\"", "\"Hold your tongue!\" said the Queen, turning purple.", "\"I won't!\" said Alice.", "\"Off with her head!\" the Queen shouted at the top of her voice.", "Nobody moved.", "\"Who cares for _you_?\" said Alice (she had grown to her full size by this time).", "\"You're nothing but a pack of cards!\"", "At this, the whole pack rose up in the air and came flying down upon her; she gave a little scream, half of fright and half of anger, and tried to beat them off, and found herself lying on the bank, with her head in the lap of her sister, who was gently brushing away some dead leaves that had fluttered down from the trees upon her face.", "\"Wake up, Alice dear!\" said her sister.", "\"Why, what a long sleep you've had!\"", "\"Oh, I've had such a curious dream!\" said Alice.", "And she told her sister, as well as she could remember them, all these strange adventures of hers that you have just been reading about.", "Alice got up and ran off, thinking while she ran, as well she might, what a wonderful dream it had been."]) ] -PDF_TEST_DATA = [ - ("This is a sentence\ncut off in the middle because pdf.", - ["This is a sentence cut off in the middle because pdf."]), - ("Organising your care early \nmeans you'll have months to build a good relationship with your midwife or doctor, ready for \nthe birth.", - ["Organising your care early means you'll have months to build a good relationship with your midwife or doctor, ready for the birth."]), - ("10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:", - ["10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:"]), - ("• 9. Stop smoking \n• 10. 
Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early", - ["• 9. Stop smoking", "• 10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:", "1. Organise your pregnancy care early"]), - ("Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\n'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' 
(Which was very likely true.)", - ["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]), - ("Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\r'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' 
(Which was very likely true.)", - ["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) - ] - TESTS_WO_CLEAN = [ ("He has Ph.D.-level training", ["He has Ph.D.-level training"]), @@ -831,17 +816,9 @@ ] @pytest.mark.parametrize('text,expected_sents', TESTS_WITH_CLEAN) -def test_en_sbd_clean(text, expected_sents): +def test_en_sbd_with_clean(en_with_clean_no_span_fixture, text, expected_sents): """SBD tests from Pragmatic Segmenter needs clean:true""" - seg = pysbd.Segmenter(language="en", clean=True) - segments = seg.segment(text) - assert segments == expected_sents - -@pytest.mark.parametrize('text,expected_sents', PDF_TEST_DATA) -def test_en_pdf_type(text, expected_sents): - """SBD tests from Pragmatic Segmenter for doctype:pdf""" - seg = pysbd.Segmenter(language="en", clean=True, doc_type='pdf') - segments = seg.segment(text) + segments = en_with_clean_no_span_fixture.segment(text) assert segments == expected_sents @pytest.mark.parametrize('text,expected_sents', TESTS_WO_CLEAN) diff --git a/tests/lang/test_hindi.py b/tests/lang/test_hindi.py new file mode 
100644 index 0000000..b80a923 --- /dev/null +++ b/tests/lang/test_hindi.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +import pytest + +GOLDEN_HI_RULES_TEST_CASES = [ + ("सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", + ["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"]) +] + +@pytest.mark.parametrize('text,expected_sents', GOLDEN_HI_RULES_TEST_CASES) +def test_hi_sbd(hi_default_fixture, text, expected_sents): + """Hindi language SBD tests from Pragmatic Segmenter""" + segments = hi_default_fixture.segment(text) + assert segments == expected_sents diff --git a/tests/lang/test_marathi.py b/tests/lang/test_marathi.py new file mode 100644 index 0000000..a9ad041 --- /dev/null +++ b/tests/lang/test_marathi.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +import pytest + +GOLDEN_MR_RULES_TEST_CASES = [ + ("आज दसरा आहे. आज खूप शुभ दिवस आहे.", + ["आज दसरा आहे.", "आज खूप शुभ दिवस आहे."]), + ("ढग खूप गर्जत होते; पण पाऊस पडत नव्हता.", + ["ढग खूप गर्जत होते; पण पाऊस पडत नव्हता."]), + ("रमाची परीक्षा कधी आहे? अवकाश आहे अजून.", + ["रमाची परीक्षा कधी आहे?", "अवकाश आहे अजून."]), + ("शाब्बास, असाच अभ्यास कर! 
आणि मग तुला नक्की यश मिळणार.", + ["शाब्बास, असाच अभ्यास कर!", "आणि मग तुला नक्की यश मिळणार."]), + ("\"आपली आपण करी स्तुती तो एक मूर्ख\" असे समर्थ रामदासस्वामी म्हणतात.", + ["\"आपली आपण करी स्तुती तो एक मूर्ख\" असे समर्थ रामदासस्वामी म्हणतात."]) +] + +@pytest.mark.parametrize('text,expected_sents', GOLDEN_MR_RULES_TEST_CASES) +def test_mr_sbd(mr_default_fixture, text, expected_sents): + """Marathi language SBD tests""" + segments = mr_default_fixture.segment(text) + assert segments == expected_sents diff --git a/tests/lang/test_spanish.py b/tests/lang/test_spanish.py new file mode 100644 index 0000000..206d7a5 --- /dev/null +++ b/tests/lang/test_spanish.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +import pytest +import pysbd + +GOLDEN_ES_RULES_TEST_CASES = [ + ("¿Cómo está hoy? Espero que muy bien.", + ["¿Cómo está hoy?", "Espero que muy bien."]), + ("¡Hola señorita! Espero que muy bien.", + ["¡Hola señorita!", "Espero que muy bien."]), + ("Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", + ["Hola Srta. Ledesma.", "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."]), + ("¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", + ["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."]), + ("«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles.", + ["«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles."] + ) +] + +ES_MORE_TEST_CASES = [ +('«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles. Pablo, ¿adónde vas? ¡¿Qué viste?!', +['«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles.', 'Pablo, ¿adónde vas?', '¡¿Qué viste?!']), + +('Admón. es administración o me equivoco.', +['Admón. es administración o me equivoco.']), + +("¡Hola Srta. Ledesma! ¿Cómo está hoy? 
Espero que muy bien.", +["¡Hola Srta. Ledesma!", "¿Cómo está hoy?", "Espero que muy bien."]), + +("Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", +["Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."]), + +("He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014. Gracias.", +["He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014.", "Gracias."]), + +("Núm. de tel: 351.123.465.4. Envíe mis saludos a la Sra. Rescia.", +["Núm. de tel: 351.123.465.4.", "Envíe mis saludos a la Sra. Rescia."]), + +("Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin. Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K.", +["Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin.", "Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K."]), + +("Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D.", +["Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D."]), + + +("Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \".", +["Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] 
\"."]), + +("Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado.", +["Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado."]), + +("Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\". ¿Qué te parece?", +["Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\".", "¿Qué te parece?"]), + +("Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU..", +["Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU.."]), + +("Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\". Disponían de 1 min. para responder esa pregunta.", +["Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\".", "Disponían de 1 min. para responder esa pregunta."]), +("La temperatura del motor alcanzó los 120.5°C. Afortunadamente, pudo llegar al final de carrera.", +["La temperatura del motor alcanzó los 120.5°C.", "Afortunadamente, pudo llegar al final de carrera."]), +("El volumen del cuerpo es 3m³. ¿Cuál es la superficie de cada cara del prisma?", +["El volumen del cuerpo es 3m³.", "¿Cuál es la superficie de cada cara del prisma?"]), +("La habitación tiene 20.55m². El living tiene 50.0m².", +["La habitación tiene 20.55m².", "El living tiene 50.0m²."]), +("1°C corresponde a 33.8°F. ¿A cuánto corresponde 35°C?", +["1°C corresponde a 33.8°F.", "¿A cuánto corresponde 35°C?"]), +("Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos. 
De esta manera se consagró ¡Campeón mundial!", +["Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos.", "De esta manera se consagró ¡Campeón mundial!"]), +("¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", +["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."]), +("El corredor No. 103 arrivó 4°.", +["El corredor No. 103 arrivó 4°."]), +("Hoy es 27/04/2014, y es mi cumpleaños. ¿Cuándo es el tuyo?", +["Hoy es 27/04/2014, y es mi cumpleaños.", "¿Cuándo es el tuyo?"]), +("Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz. ¿Cuánto costará? Quizás $12.5.", +["Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz.", "¿Cuánto costará?", "Quizás $12.5."]), +("1 + 1 es 2. 2 + 2 es 4. El auto es de color rojo.", +["1 + 1 es 2.", "2 + 2 es 4.", "El auto es de color rojo."]), +("La máquina viajaba a 100 km/h. ¿En cuánto tiempo recorrió los 153 Km.?", +["La máquina viajaba a 100 km/h.", "¿En cuánto tiempo recorrió los 153 Km.?"]), +("Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco.", +["Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco."]) +] + +ES_CLEAN_TEST_CASES = [("\n \nCentro de Relaciones Interinstitucionales -CERI \n\nCra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia \n\nhttp://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co \n\n \n\nCERI 0908 \n \nBogotá, D.C. 6 de noviembre de 2014. \n \nSeñores: \nEMBAJADA DE UNITED KINGDOM \n \n", +["Centro de Relaciones Interinstitucionales -CERI", "Cra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. 
- Colombia", "http://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co", "CERI 0908", "Bogotá, D.C. 6 de noviembre de 2014.", "Señores:", "EMBAJADA DE UNITED KINGDOM"]), +("N°. 1026.253.553", +["N°. 1026.253.553"]), +("\n__________________________________________________________\nEl Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad.", +["__________________________________________________________", "El Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad."]), +("• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa", +["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa"]), +("• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. 
Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa \n• 11. Hola", +["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa", "• 11. Hola"]) +] + +@pytest.mark.parametrize('text,expected_sents', GOLDEN_ES_RULES_TEST_CASES) +def test_es_sbd(es_default_fixture, text, expected_sents): + """Spanish (Espanol) language SBD tests from Pragmatic Segmenter""" + segments = es_default_fixture.segment(text) + assert segments == expected_sents + +@pytest.mark.parametrize('text,expected_sents', ES_MORE_TEST_CASES) +def test_es_sbd_more_examples(es_default_fixture, text, expected_sents): + """Spanish (Espanol) language SBD tests from Pragmatic Segmenter Contributors""" + segments = es_default_fixture.segment(text) + assert segments == expected_sents + +@pytest.mark.parametrize('text,expected_sents', ES_CLEAN_TEST_CASES) +def test_es_sbd_clean(es_with_clean_no_span_fixture, text, expected_sents): + """Spanish (Espanol) language SBD tests from Pragmatic Segmenter Contributors needing clean=True""" + segments = es_with_clean_no_span_fixture.segment(text) + assert segments == expected_sents + +ES_PDF_CASE = [("\nA continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN \nSANDOVAL, identificada con el documento N°. 
1026.253.553 de Bogotá, \negresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por \nsu excelencia académica, actualmente cursa el programa de Maestría en \nIngeniería Industrial y se encuentra en un intercambio cultural en Bangalore – \nIndia.", +["A continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN SANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, egresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por su excelencia académica, actualmente cursa el programa de Maestría en Ingeniería Industrial y se encuentra en un intercambio cultural en Bangalore – India."])] + +@pytest.mark.parametrize('text,expected_sents', ES_PDF_CASE) +def test_es_pdf_type(text, expected_sents): + """Spanish SBD tests from Pragmatic Segmenter for doctype:pdf""" + seg = pysbd.Segmenter(language="es", clean=True, doc_type='pdf') + segments = seg.segment(text) + assert segments == expected_sents diff --git a/tests/regression/test_issues.py b/tests/regression/test_issues.py index d0f27df..3ce421e 100644 --- a/tests/regression/test_issues.py +++ b/tests/regression/test_issues.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import pytest import pysbd +from pysbd.utils import TextSpan TEST_ISSUE_DATA = [ ('#27', "This new form of generalized PDF in (9) is generic and suitable for all the fading models presented in Table I withbranches MRC reception. 
In section III, (9) will be used in the derivations of the unified ABER and ACC expression.", @@ -25,7 +26,40 @@ ['As an example of a different special-purpose mechanism, we have introduced a methodology for letting donors make their donations to charities conditional on donations by other donors (who, in turn, can make their donations conditional) [70].', 'We have used this mechanism to collect money for Indian Ocean Tsunami and Hurricane Katrina victims.', "We have also introduced a more general framework for negotiation when one agent's actions have a direct effect (externality) on the other agents' utilities [69].", 'Both the charities and externalities methodologies require the solution of NP-hard optimization problems in general, but there are some natural tractable cases as well as effective MIP formulations.', 'Recently, Ghosh and Mahdian [86] at Yahoo! Research extended our charities work, and based on this a web-based system for charitable donations was built at Yahoo!']), ('#39', "T stands for the vector transposition. As shown in Fig. ??", ["T stands for the vector transposition.", "As shown in Fig. ??"]), - ('#39', 'Fig. ??', ['Fig. ??']) + ('#39', 'Fig. ??', ['Fig. ??']), + ('#58', 'Rok bud.2027777983834843834843042003200220012000199919981997199619951994199319921991199019891988198042003200220012000199919981997199619951994199319921991199019891988198', + ['Rok bud.2027777983834843834843042003200220012000199919981997199619951994199319921991199019891988198042003200220012000199919981997199619951994199319921991199019891988198']) +] + +TEST_ISSUE_DATA_CHAR_SPANS = [ + ('#49', "1) The first item. 2) The second item.", + [('1) The first item. ', 0, 18), ('2) The second item.', 19, 38)] + ), + ('#49', "a. The first item. b. The second item. c. The third list item", + [ + ('a. The first item. ', 0, 18), ('b. The second item. ', 19, 38), + ('c. 
The third list item', 39, 61)] + ), + ('#53', "Trust in journalism is not associated with frequency of media use (except in the case of television as mentioned above), indicating that trust is not an important predictor of media use, though it might have an important impact on information processing. This counterintuitive fi nding can be explained by taking into account the fact that audiences do not watch informative content merely to inform themselves; they have other motivations that might override credibility concerns. For example, they might follow media primarily for entertainment purposes and consequently put less emphasis on the quality of the received information.As <|CITE|> have claimed, audiences tend to approach and process information differently depending on the channel; they approach television primarily for entertainment and newspapers primarily for information. This has implications for trust as well since audiences in an entertainment processing mode will be less attentive to credibility cues, such as news errors, than those in an information processing mode (Ibid.). <|CITE|> research confi rms this claim -he found that audiences tend to approach newspaper reading more actively than television viewing and that credibility assessments differ regarding whether audience members approach news actively or passively. These fi ndings can help explain why we found a weak positive correlation between television news exposure and trust in journalism. It could be that audiences turn to television not because they expect the best quality information but rather the opposite -namely, that they approach television news less critically, focus less attention on credibility concerns and, therefore, develop a higher degree of trust in journalism. 
The fact that those respondents who follow the commercial television channel POP TV and the tabloid Slovenske Novice exhibit a higher trust in journalistic objectivity compared to those respondents who do not follow these media is also in line with this interpretation. The topic of Janez Janša and exposure to media that are favourable to him and his SDS party is negatively connected to trust in journalism. This phenomenon can be partly explained by the elaboration likelihood model <|CITE|> , according to which highly involved individuals tend to process new information in a way that maintains and confi rms their original opinion by 1) taking information consistent with their views (information that falls within a narrow range of acceptance) as simply veridical and embracing it, and 2) judging counter-attitudinal information to be the product of biased, misguided or ill-informed sources and rejecting it <|CITE|> <|CITE|> . Highly partisan audiences will, therefore, tend to react to dissonant information by lowering the trustworthiness assessment of the source of such information.", + [('Trust in journalism is not associated with frequency of media use (except in the case of television as mentioned above), indicating that trust is not an important predictor of media use, though it might have an important impact on information processing. ', 0, 254), + ('This counterintuitive fi nding can be explained by taking into account the fact that audiences do not watch informative content merely to inform themselves; they have other motivations that might override credibility concerns. ', 255, 481), + ('For example, they might follow media primarily for entertainment purposes and consequently put less emphasis on the quality of the received information.As <|CITE|> have claimed, audiences tend to approach and process information differently depending on the channel; they approach television primarily for entertainment and newspapers primarily for information. 
', 482, 843), + ('This has implications for trust as well since audiences in an entertainment processing mode will be less attentive to credibility cues, such as news errors, than those in an information processing mode (Ibid.). ', 844, 1054), + ('<|CITE|> research confi rms this claim -he found that audiences tend to approach newspaper reading more actively than television viewing and that credibility assessments differ regarding whether audience members approach news actively or passively. ', 1055, 1303), + ('These fi ndings can help explain why we found a weak positive correlation between television news exposure and trust in journalism. ', 1304, 1435), + ('It could be that audiences turn to television not because they expect the best quality information but rather the opposite -namely, that they approach television news less critically, focus less attention on credibility concerns and, therefore, develop a higher degree of trust in journalism. ', 1436, 1728), + ('The fact that those respondents who follow the commercial television channel POP TV and the tabloid Slovenske Novice exhibit a higher trust in journalistic objectivity compared to those respondents who do not follow these media is also in line with this interpretation. ', 1729, 1998), + ('The topic of Janez Janša and exposure to media that are favourable to him and his SDS party is negatively connected to trust in journalism. ', 1999, 2138), + ('This phenomenon can be partly explained by the elaboration likelihood model <|CITE|> , according to which highly involved individuals tend to process new information in a way that maintains and confi rms their original opinion by ', 2139, 2368), + ('1) taking information consistent with their views (information that falls within a narrow range of acceptance) as simply veridical and embracing it, and ', 2369, 2521), + ('2) judging counter-attitudinal information to be the product of biased, misguided or ill-informed sources and rejecting it <|CITE|> <|CITE|> . 
', 2522, 2664), + ('Highly partisan audiences will, therefore, tend to react to dissonant information by lowering the trustworthiness assessment of the source of such information.', 2665, 2824)] + ), + ('#55', "She turned to him, \"This is great.\" She held the book out to show him.", + [ + ('She turned to him, "This is great." ', 0, 35), ('She held the book out to show him.', 36, 70) + ]) + ] @pytest.mark.parametrize('issue_no,text,expected_sents', TEST_ISSUE_DATA) @@ -36,3 +70,14 @@ def test_issue(issue_no, text, expected_sents): assert segments == expected_sents # clubbing sentences and matching with original text assert text == " ".join(segments) + +@pytest.mark.parametrize('issue_no,text,expected_sents_w_spans', TEST_ISSUE_DATA_CHAR_SPANS) +def test_issues_with_char_spans(issue_no, text, expected_sents_w_spans): + """pySBD issues tests from https://github.com/nipunsadvilkar/pySBD/issues/""" + seg = pysbd.Segmenter(language="en", clean=False, char_span=True) + segments = seg.segment(text) + expected_text_spans = [TextSpan(sent_w_span[0], sent_w_span[1], sent_w_span[2]) + for sent_w_span in expected_sents_w_spans] + assert segments == expected_text_spans + # clubbing sentences and matching with original text + assert text == "".join([seg.sent for seg in segments]) diff --git a/tests/sample.txt b/tests/sample.txt new file mode 100644 index 0000000..0b5f5a9 --- /dev/null +++ b/tests/sample.txt @@ -0,0 +1,106 @@ +import pysbd +seg = pysbd.Segmenter() +lines2 = open('tests/sample2.txt').read().splitlines() +for l in lines2: + segments = seg.segment(l) + if len(segments) > 1: + print(len(segments), repr(segments)) + +One sentence per line. +And another sentence on the same line. +(How about a sentence in parenthesis?) +Or a sentence with "a quote!" +'How about those pesky single quotes?' +[And not to forget about square brackets.] +And, brackets before the terminal [2]. +You know Mr. Abbreviation I told you so. +What about the med. staff here? +But the undef. 
abbreviation not. +And this f.e. is tricky stuff. +I.e. a little easier here. +However, e.g., should be really easy. +Three is one btw., is clear. +Their presence was detected by transformation into S. lividans. +Three subjects diagnosed as having something. +What the heck??!?! +(A) First things here. +(1) No, they go here. +[z] Last, but not least. +(vii) And the Romans, too. +Let's meet at 14.10 in N.Y.. +This happened in the U.S. last week. +Brexit: The E.U. and the U.K. are separating. +Refugees are welcome in the E.U.. +But they are thrown out of the U.K.. +And they never get to the U.S.. +The U.S. Air Force was called in. +What about the E.U. High Court? +And then there is the U.K. House of Commons. +Now only this splits: the EU. +A sentence ending in U.S. Another that will not split. +12 monkeys ran into here. +Nested +(Parenthesis. +(With words inside! +(Right.)) +(More stuff. +Uff, this is it!)) +In the Big City. +How we got an A. Mathematics . dot times. +An abbreviation at the end.. +This is a sentence terminal ellipsis... +This is another sentence terminal ellipsis.... +An easy to handle G. species mention. +Am 13. Jän. 2006 war es regnerisch. +The basis for Lester B. Pearson's policy was later. +This model was introduced by Dr. Edgar F. Codd after initial criticisms. +This quote "He said it." is actually inside. +B. Obama fas the first black US president. +A. The first assumption. +B. The second bullet. +C. The last case. +1. This is one. +2. And that is two. +3. Finally, three, too. +A 130 nm CMOS power amplifier (PA) operating at 2.4 GHz. +Its power stage is composed of a set of amplifying cells. +Specimens (n = 32) were sent for 16S rRNA PCR. +16S rRNA PCR could identify an organism in 10 of 32 cases (31.2%). +Cannabis sativa subsp. sativa at Noida was also confirmed. +Eight severely CILY-affected villages of Grand-Lahou in 2015. +Leaves, inflorescences and trunk borings were collected. +Disturbed the proper intracellular localization of TPRBK. 
+Moreover, the knockdown of TPRBK expression. +Elevated expression of LC3. +Importantly, immunohistochemistry analysis revealed it. +Bacterium produced 45U/mL -mannanase at 50 degrees C. +The culture conditions for high-level production. +Integration (e.g., on-chip etc.), can translate to lower cost. +The invasive capacity of S. Typhi is high. +Most pRNAs have a length of 8-15 nt, very few up to 24 nt. +The average length of pRNAs tended to increase from stationary to outgrowth conditions. +Results: In AAA, significantly enhanced mRNA expression was observed (p <= .001). +MMPs with macrophages (p = .007, p = .018, and p = .015, resp.). +And synth. muscle cells with MMPs (p = .020, p = .018, and p = .027, respectively). +(C) 2017 Company Ltd. +All rights reserved. +(C) 2017 Company B.V. +All rights reserved. +Northern blotting and RT-PCR. +C2m9 and C2m45 carried missense mutations. +The amplifier consumes total DC power of 167 uW. +The input-referred noise is 110 nV/sqrt(Hz). +Inflammation via activation of TLR4. +We also identify a role for TLR4. +Effects larger (eta(2) = .53), with cognition (eta(2) = .14) and neurocognition (eta(2) = .16). +All validations show a good approximation of the behavior of the DMFC. +In addition, a simulated application of a circuit system is explained. +Conclusions: Our data suggest CK5/6, CK7, and CK18 in the subclassification of NSCLC. +Copyright (C) 2018 S. Korgur AG, Basel. +Gelatin degradation by MMP-9. +ConclusionThis study provides clear evidence. +A sampling frequency of 780 MHz. +The figure-of-merit of the modulator is there. +Patients with prodromal DLB. +In line with the literature on DLB. +Always last, clear closing example. diff --git a/tests/sample2.txt b/tests/sample2.txt new file mode 100644 index 0000000..1da381c --- /dev/null +++ b/tests/sample2.txt @@ -0,0 +1,43 @@ +This is a sentence. This is another sentence. +This is a sentence! This is another sentence! +Is this a sentence? Is this another sentence? 
+This is a sentence. (This is another sentence.) +(This is a sentence.) This is another sentence. +This is a sentence. "This is another sentence." +This is a sentence. 'This is another sentence. +"This is a sentence." This is another sentence. +"This is a sentence." "This is another sentence." +{"This is a sentence."} ["This is another sentence."] +This is Mr. Motto here. And here is Mrs. Smithers. +This is Capt. Motto here. And here is Sra. Smithers. +This f.e. here. And here is med. help. +This f. e. here. And here is unknwn. help. +1. This goes first. 2. And here thereafter. +A. This goes first. B. And here thereafter. +I. This goes first. II. And here thereafter. +Who did this? I. No! Such a shame. +Brackets before the terminal [2]. You know I told you so. +Let's meet at 14.10 in N.Y.. This happened in the U.S. last week. +operating at 2.4 GHz. Its power stage +got an A. And then he +in the E.U.. But they are +resp.). Indicate +Don't splt., please! +The U.S. Air Force is here. +The basis for Lester B. Pearson's policy was later. +got an A. Mathematics was +This is abcf. 123 here. +This is no. A13 here. +This is abcf. (123) in here. +This is (Proc. ABC with Abs. Reg. Compliance) not here. +This is (Proc. ABC with Abs. Reg. Compliance) not here. +ET in the 112 ER+ patients (HR=2.79 for high CCNE1, p= .005 and .HR=1.97 for CCNE2, p= .05) is wrong. +This was shown by (A. Author et al.) a few months ago. +This is one. (Here is another view of the same. And then there is a different case here.) +This is one (Here is another view of the same. And then there is a different case here.) +What the heck? (A) First things here. +And another sentence on the same line. (How about a sentence in parenthesis?) Or a sentence with "a quote!" +Specimens (n = 32) were sent for 16S rRNA PCR. +Four patients (67%) with an average response of 3.3 mos. (range 6 wks. to 12 mos.) +Packed cells (PRBC) for less than 20,000 thousand/micro.L, repsectively. +This is Company Wag.B.H., truly. 
diff --git a/tests/test_char_span.py b/tests/test_char_span.py deleted file mode 100644 index 43a3b80..0000000 --- a/tests/test_char_span.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -import pysbd -from pysbd.utils import TextSpan - -@pytest.mark.parametrize('text,expected', - [('My name is Jonas E. Smith. Please turn to p. 55.', - [TextSpan(sent='My name is Jonas E. Smith.', - start=0, end=26), - TextSpan(sent='Please turn to p. 55.', - start=27, end=48)])]) -def test_sbd_char_span(text, expected): - """Test sentences with character offsets""" - seg = pysbd.Segmenter(language="en", clean=False, char_span=True) - segments = seg.segment(text) - assert segments == expected - -@pytest.mark.xfail(raises=ValueError) -def test_sbd_clean_chart_span(): - """Test to not allow clean=True and char_span=True - """ - seg = pysbd.Segmenter(language="en", clean=True, char_span=True) - text = "

Hello

\n

This is a test. Another test.

" - seg.segment(text) diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py new file mode 100644 index 0000000..a3e96fa --- /dev/null +++ b/tests/test_cleaner.py @@ -0,0 +1,27 @@ +import pytest +from pysbd.cleaner import Cleaner +from pysbd.languages import Language + +TEST_TOBE_CLEANED_DATA = [ + ("It was a cold \nnight in the city.", "It was a cold night in the city."), + ("This is the U.S. Senate my friends. Yes. It is!", + "This is the U.S. Senate my friends. Yes. It is!") +] + +@pytest.mark.parametrize('text,expected_cleaned_sents', TEST_TOBE_CLEANED_DATA) +def test_cleaner(text, expected_cleaned_sents): + """SBD tests from Pragmatic Segmenter""" + cleaned_text = Cleaner(text, Language.get_language_code('en')).clean() + assert cleaned_text == expected_cleaned_sents + +def test_cleaner_doesnt_mutate_input(text="It was a cold \nnight in the city."): + cleaned_text = Cleaner(text, Language.get_language_code('en')).clean() + assert text == "It was a cold \nnight in the city." + +def test_cleaner_none_input(text=None): + cleaned_text = Cleaner(text, Language.get_language_code('en')).clean() + assert cleaned_text == text + +def test_cleaner_no_input(text=""): + cleaned_text = Cleaner(text, Language.get_language_code('en')).clean() + assert cleaned_text == text diff --git a/tests/test_languages.py b/tests/test_languages.py new file mode 100644 index 0000000..cb8b0c8 --- /dev/null +++ b/tests/test_languages.py @@ -0,0 +1,17 @@ +import pytest +from pysbd.languages import LANGUAGE_CODES, Language + + +def test_lang_code2instance_mapping(): + for code, language_module in LANGUAGE_CODES.items(): + assert Language.get_language_code(code) == language_module + +def test_exception_on_no_lang_code_provided(): + with pytest.raises(ValueError) as e: + Language.get_language_code('') + assert "Provide valid language ID i.e. ISO code." 
in str(e.value) + +def test_exception_on_unsupported_lang_code_provided(): + with pytest.raises(ValueError) as e: + Language.get_language_code('elvish') + assert "Provide valid language ID i.e. ISO code." in str(e.value) diff --git a/tests/test_segmenter.py b/tests/test_segmenter.py new file mode 100644 index 0000000..db4432e --- /dev/null +++ b/tests/test_segmenter.py @@ -0,0 +1,69 @@ +import pytest +import pysbd +from pysbd.utils import TextSpan + + +def test_no_input(pysbd_default_en_no_clean_no_span_fixture, text=""): + segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) + assert segments == [] + +def test_none_input(pysbd_default_en_no_clean_no_span_fixture, text=None): + segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) + assert segments == [] + +def test_newline_input(pysbd_default_en_no_clean_no_span_fixture, text="\n"): + segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) + assert segments == [] + +def test_segmenter_doesnt_mutate_input(pysbd_default_en_no_clean_no_span_fixture, + text='My name is Jonas E. Smith. Please turn to p. 55.'): + segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) + assert text == 'My name is Jonas E. Smith. Please turn to p. 55.' + +@pytest.mark.parametrize('text,expected', + [('My name is Jonas E. Smith. Please turn to p. 55.', + [ + ('My name is Jonas E. Smith. ', 0, 26), + ('Please turn to p. 
55.', 27, 48), + ]) + ]) +def test_sbd_char_span(en_no_clean_with_span_fixture, text, expected): + """Test sentences with character offsets""" + segments = en_no_clean_with_span_fixture.segment(text) + expected_text_spans = [TextSpan(sent_w_span[0], sent_w_span[1], sent_w_span[2]) + for sent_w_span in expected] + assert segments == expected_text_spans + # clubbing sentences and matching with original text + assert text == "".join([seg.sent for seg in segments]) + +def test_exception_with_both_clean_and_span_true(): + """Test to not allow clean=True and char_span=True + """ + with pytest.raises(ValueError) as e: + seg = pysbd.Segmenter(language="en", clean=True, char_span=True) + text = "

Hello

\n

This is a test. Another test.

" + seg.segment(text) + assert str(e.value) == "char_span must be False if clean is True. "\ + "Since `clean=True` will modify original text." + +PDF_TEST_DATA = [ + ("This is a sentence\ncut off in the middle because pdf.", + ["This is a sentence cut off in the middle because pdf."]), + ("Organising your care early \nmeans you'll have months to build a good relationship with your midwife or doctor, ready for \nthe birth.", + ["Organising your care early means you'll have months to build a good relationship with your midwife or doctor, ready for the birth."]), + ("10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:", + ["10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:"]), + ("• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early", + ["• 9. Stop smoking", "• 10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:", "1. Organise your pregnancy care early"]), + ("Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. 
She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\n'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", + ["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]), + ("Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. 
She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\r'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", + ["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) + ] + +@pytest.mark.parametrize('text,expected_sents', PDF_TEST_DATA) +def test_en_pdf_type(text, expected_sents): + """SBD tests from Pragmatic Segmenter for doctype:pdf""" + seg = pysbd.Segmenter(language="en", clean=True, doc_type='pdf') + segments = seg.segment(text) + assert segments == expected_sents