Add count_sentences function to nlp.py (jbesomi#51)
* Add count_sentences function to nlp.py

Also add tests for the function to test_nlp.py

* Implement suggestions from the pull request.

Add more tests, change style (docstring, test naming).
Remove unicode-casting to avoid unexpected behaviour.

* Add link to spacy documentation.

Additionally, update the index tests; they're cleaner now.

Co-authored-by: Henri Froese <[email protected]>
henrifroese and henrifroese committed Jul 12, 2020
1 parent 81411c2 commit d50559d
Showing 2 changed files with 64 additions and 2 deletions.
32 changes: 32 additions & 0 deletions tests/test_nlp.py
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
from texthero import nlp

from . import PandasTestCase
@@ -36,3 +37,34 @@ def test_noun_chunks(self):
[[("Today", "NP", 0, 5), ("such a beautiful day", "NP", 9, 29)]]
)
self.assertEqual(nlp.noun_chunks(s), s_true)

"""
Count sentences.
"""

def test_count_sentences(self):
s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
s_true = pd.Series(3)
self.assertEqual(nlp.count_sentences(s), s_true)

def test_count_sentences_numeric(self):
s = pd.Series([13.0, 42.0])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_missing_value(self):
s = pd.Series(["Test.", np.nan])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_index(self):
s = pd.Series(["Test"], index=[5])
counted_sentences_s = nlp.count_sentences(s)
t_same_index = pd.Series([""], index=[5])

self.assertTrue(counted_sentences_s.index.equals(t_same_index.index))

def test_count_sentences_wrong_index(self):
s = pd.Series(["Test", "Test"], index=[5, 6])
counted_sentences_s = nlp.count_sentences(s)
t_different_index = pd.Series(["", ""], index=[5, 7])

self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))
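For context, a minimal sketch (a hypothetical session, assuming texthero and the spaCy model `en_core_web_sm` are installed) of the index-preservation behaviour the last two tests exercise:

import pandas as pd
from texthero import nlp

# count_sentences builds its result with index=s.index, so a custom
# index such as [5, 6] carries over unchanged to the returned Series.
s = pd.Series(["One sentence.", "Two sentences. Really!"], index=[5, 6])
counts = nlp.count_sentences(s)
print(counts.index.tolist())  # [5, 6]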
34 changes: 32 additions & 2 deletions texthero/nlp.py
@@ -11,11 +11,11 @@ def named_entities(s, package="spacy"):
Return named-entities.
Return a Pandas Series where each row contains a list of tuples with information about the named entities found in that cell.
Tuple: (`entity name`, `entity label`, `starting character`, `ending character`)
Under the hood, `named_entities` makes use of spaCy named entity recognition.
List of labels:
- `PERSON`: People, including fictional.
- `NORP`: Nationalities or religious or political groups.
@@ -76,3 +76,33 @@ def noun_chunks(s):
)

return pd.Series(noun_chunks, index=s.index)


def count_sentences(s: pd.Series) -> pd.Series:
"""
Count the number of sentences per cell in a Pandas Series.
Return a new Pandas Series with the number of sentences per cell.
This makes use of the spaCy `sentencizer <https://spacy.io/api/sentencizer>`_.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Yesterday I was in NY with Bill de Blasio. Great story...", "This is the F.B.I.! What? Open up!"])
>>> hero.count_sentences(s)
0 2
1 3
dtype: int64
"""
number_of_sentences = []

nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
nlp.add_pipe(nlp.create_pipe("sentencizer"))  # sentencizer is the only pipe needed to count sentences

for doc in nlp.pipe(s.values, batch_size=32):
sentences = len(list(doc.sents))
number_of_sentences.append(sentences)

return pd.Series(number_of_sentences, index=s.index)
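As a side note, a minimal standalone sketch (spaCy v2 API, matching the calls used above, and assuming the `en_core_web_sm` model is installed) of what the sentencizer-only pipeline does for a single string:

import spacy

# Load the English model with every statistical component disabled;
# the rule-based sentencizer added next is the only active pipe.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
nlp.add_pipe(nlp.create_pipe("sentencizer"))

doc = nlp("This is the F.B.I.! What? Open up!")
print(len(list(doc.sents)))  # 3, matching the doctest above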
