Add count_sentences function to nlp.py (jbesomi#51)
* Add count_sentences function to nlp.py

Also add tests for the function to test_nlp.py

* Implement suggestions from the pull request.

Add more tests, change style (docstring, test naming).
Remove unicode-casting to avoid unexpected behaviour.

* Add link to spacy documentation.

Additionally, update the index tests; they're cleaner now.

Co-authored-by: Henri Froese <[email protected]>
henrifroese and henrifroese committed Jul 12, 2020
1 parent 81411c2 commit d50559d
Showing 2 changed files with 64 additions and 2 deletions.
32 changes: 32 additions & 0 deletions tests/test_nlp.py
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
from texthero import nlp

from . import PandasTestCase
@@ -36,3 +37,34 @@ def test_noun_chunks(self):
[[("Today", "NP", 0, 5), ("such a beautiful day", "NP", 9, 29)]]
)
self.assertEqual(nlp.noun_chunks(s), s_true)

"""
Count sentences.
"""

def test_count_sentences(self):
s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
s_true = pd.Series(3)
self.assertEqual(nlp.count_sentences(s), s_true)

def test_count_sentences_numeric(self):
s = pd.Series([13.0, 42.0])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_missing_value(self):
s = pd.Series(["Test.", np.nan])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_index(self):
s = pd.Series(["Test"], index=[5])
counted_sentences_s = nlp.count_sentences(s)
t_same_index = pd.Series([""], index=[5])

self.assertTrue(counted_sentences_s.index.equals(t_same_index.index))

def test_count_sentences_wrong_index(self):
s = pd.Series(["Test", "Test"], index=[5, 6])
counted_sentences_s = nlp.count_sentences(s)
t_different_index = pd.Series(["", ""], index=[5, 7])

self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))
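For context, a minimal sketch (a hypothetical session, assuming texthero and the spaCy model `en_core_web_sm` are installed) of the index-preservation behaviour the last two tests exercise:

import pandas as pd
from texthero import nlp

# count_sentences builds its result with index=s.index, so a custom
# index such as [5, 6] carries over unchanged to the returned Series.
s = pd.Series(["One sentence.", "Two sentences. Really!"], index=[5, 6])
counts = nlp.count_sentences(s)
print(counts.index.tolist())  # [5, 6]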
34 changes: 32 additions & 2 deletions texthero/nlp.py
@@ -11,11 +11,11 @@ def named_entities(s, package="spacy"):
Return named-entities.
Return a Pandas Series where each row contains a list of tuples with information about the named entities found in that cell.
Tuple: (`entity name`, `entity label`, `starting character`, `ending character`)
Under the hood, `named_entities` makes use of spaCy named entity recognition.
List of labels:
- `PERSON`: People, including fictional.
- `NORP`: Nationalities or religious or political groups.
@@ -76,3 +76,33 @@ def noun_chunks(s):
)

return pd.Series(noun_chunks, index=s.index)


def count_sentences(s: pd.Series) -> pd.Series:
"""
Count the number of sentences per cell in a Pandas Series.
Return a new Pandas Series with the number of sentences per cell.
This makes use of the spaCy `sentencizer <https://spacy.io/api/sentencizer>`_.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Yesterday I was in NY with Bill de Blasio. Great story...", "This is the F.B.I.! What? Open up!"])
>>> hero.count_sentences(s)
0 2
1 3
dtype: int64
"""
number_of_sentences = []

nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
nlp.add_pipe(nlp.create_pipe("sentencizer"))  # sentencizer is the only pipe needed to count sentences

for doc in nlp.pipe(s.values, batch_size=32):
sentences = len(list(doc.sents))
number_of_sentences.append(sentences)

return pd.Series(number_of_sentences, index=s.index)
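As a side note, a minimal standalone sketch (spaCy v2 API, matching the calls used above, and assuming the `en_core_web_sm` model is installed) of what the sentencizer-only pipeline does for a single string:

import spacy

# Load the English model with every statistical component disabled;
# the rule-based sentencizer added next is the only active pipe.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
nlp.add_pipe(nlp.create_pipe("sentencizer"))

doc = nlp("This is the F.B.I.! What? Open up!")
print(len(list(doc.sents)))  # 3, matching the doctest above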
