Improve documentation and remove lowercase args from representation

jbesomi · Jun 1, 2020 · 84fabd9 · 84fabd9
1 parent 8d7b701
commit 84fabd9
Show file tree

Hide file tree

Showing 6 changed files with 470 additions and 142 deletions.
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
@@ -193,17 +193,17 @@ def test_has_content(self):
 
     def test_remove_urls(self):
         s = pd.Series("http://tests.com http://www.tests.com")
-        s_true = pd.Series(" ")
+        s_true = pd.Series("   ")
         self.assertEqual(preprocessing.remove_urls(s), s_true)
 
     def test_remove_urls_https(self):
         s = pd.Series("https://tests.com https://www.tests.com")
-        s_true = pd.Series(" ")
+        s_true = pd.Series("   ")
         self.assertEqual(preprocessing.remove_urls(s), s_true)
 
     def test_remove_urls_multiline(self):
         s = pd.Series("https://tests.com \n https://tests.com")
-        s_true = pd.Series(" \n ")
+        s_true = pd.Series("  \n  ")
         self.assertEqual(preprocessing.remove_urls(s), s_true)
 
     """

diff --git a/tests/test_representation.py b/tests/test_representation.py
@@ -37,15 +37,10 @@ def test_term_frequency_not_lowercase(self):
         s_true = pd.Series([[1, 1]])
         self.assertEqual(representation.term_frequency(s), s_true)
 
-    def test_term_frequency_lowercase(self):
-        s = pd.Series(["one ONE"])
-        s_true = pd.Series([[2]])
-        self.assertEqual(representation.term_frequency(s, lowercase=True), s_true)
-
     def test_term_frequency_punctuation_are_kept(self):
         s = pd.Series(["one !"])
         s_true = pd.Series([[1, 1]])
-        self.assertEqual(representation.term_frequency(s, lowercase=True), s_true)
+        self.assertEqual(representation.term_frequency(s), s_true)
 
     """
     TF-IDF
@@ -62,8 +57,3 @@ def test_idf_single_not_lowercase(self):
         s = pd.Series("ONE one")
         s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]])
         self.assertEqual(representation.tfidf(s), s_true)
-
-    def test_idf_single_lowercase(self):
-        s = pd.Series("ONE one")
-        s_true = pd.Series([[1.0]])
-        self.assertEqual(representation.tfidf(s, lowercase=True), s_true)
diff --git a/texthero/nlp.py b/texthero/nlp.py
@@ -10,27 +10,39 @@ def named_entities(s, package="spacy"):
     """
     Return named-entities.
 
-    Use Spacy named-entity-recognition.
-
-        PERSON: People, including fictional.
-        NORP: Nationalities or religious or political groups.
-        FAC: Buildings, airports, highways, bridges, etc.
-        ORG: Companies, agencies, institutions, etc.
-        GPE: Countries, cities, states.
-        LOC: Non-GPE locations, mountain ranges, bodies of water.
-        PRODUCT: Objects, vehicles, foods, etc. (Not services.)
-        EVENT: Named hurricanes, battles, wars, sports events, etc.
-        WORK_OF_ART: Titles of books, songs, etc.
-        LAW: Named documents made into laws.
-        LANGUAGE: Any named language.
-        DATE: Absolute or relative dates or periods.
-        TIME: Times smaller than a day.
-        PERCENT: Percentage, including ”%“.
-        MONEY: Monetary values, including unit.
-        QUANTITY: Measurements, as of weight or distance.
-        ORDINAL: “first”, “second”, etc.
-        CARDINAL:	Numerals that do not fall under another type.
+    Return a Pandas Series where each row contains a list of tuples describing the named entities found in that row.
+    
+    Tuple: (`entity name`, `entity label`, `starting character`, `ending character`)
 
+    Under the hood, `named_entities` makes use of spaCy named-entity recognition.
+    
+    List of labels:
+     - `PERSON`: People, including fictional.
+     - `NORP`: Nationalities or religious or political groups.
+     - `FAC`: Buildings, airports, highways, bridges, etc.
+     - `ORG`: Companies, agencies, institutions, etc.
+     - `GPE`: Countries, cities, states.
+     - `LOC`: Non-GPE locations, mountain ranges, bodies of water.
+     - `PRODUCT`: Objects, vehicles, foods, etc. (Not services.)
+     - `EVENT`: Named hurricanes, battles, wars, sports events, etc.
+     - `WORK_OF_ART`: Titles of books, songs, etc.
+     - `LAW`: Named documents made into laws.
+     - `LANGUAGE`: Any named language.
+     - `DATE`: Absolute or relative dates or periods.
+     - `TIME`: Times smaller than a day.
+     - `PERCENT`: Percentage, including “%”.
+     - `MONEY`: Monetary values, including unit.
+     - `QUANTITY`: Measurements, as of weight or distance.
+     - `ORDINAL`: “first”, “second”, etc.
+     - `CARDINAL`: Numerals that do not fall under another type.
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series("Yesterday I was in NY with Bill de Blasio")
+    >>> hero.named_entities(s)[0]
+    [('Yesterday', 'DATE', 0, 9), ('NY', 'GPE', 19, 21), ('Bill de Blasio', 'PERSON', 27, 41)]
     """
     entities = []
 
@@ -47,11 +59,9 @@ def named_entities(s, package="spacy"):
 
 def noun_chunks(s):
     """
-    Return noun_chunks, flat phrases that have a noun as their head.
-
-    Return group of consecutive words that belong together.
-    
+    Return noun chunks: groups of consecutive words that belong together.
     """
+
     noun_chunks = []
 
     nlp = spacy.load("en_core_web_sm", disable=["ner"])
@@ -66,11 +76,3 @@ def noun_chunks(s):
         )
 
     return pd.Series(noun_chunks, index=s.index)
-
-
-def dependency_parse(s):
-    """
-    Return the dependency parse
-    
-    """
-    return NotImplemented