Skip to content

Commit

Permalink
Improve documentation and remove lowercase args from representation
Browse files Browse the repository at this point in the history
  • Loading branch information
jbesomi committed Jun 1, 2020
1 parent 8d7b701 commit 84fabd9
Show file tree
Hide file tree
Showing 6 changed files with 470 additions and 142 deletions.
6 changes: 3 additions & 3 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,17 @@ def test_has_content(self):

def test_remove_urls(self):
s = pd.Series("http://tests.com http://www.tests.com")
s_true = pd.Series(" ")
s_true = pd.Series(" ")
self.assertEqual(preprocessing.remove_urls(s), s_true)

def test_remove_urls_https(self):
s = pd.Series("https://tests.com https://www.tests.com")
s_true = pd.Series(" ")
s_true = pd.Series(" ")
self.assertEqual(preprocessing.remove_urls(s), s_true)

def test_remove_urls_multiline(self):
s = pd.Series("https://tests.com \n https://tests.com")
s_true = pd.Series(" \n ")
s_true = pd.Series(" \n ")
self.assertEqual(preprocessing.remove_urls(s), s_true)

"""
Expand Down
12 changes: 1 addition & 11 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,10 @@ def test_term_frequency_not_lowercase(self):
s_true = pd.Series([[1, 1]])
self.assertEqual(representation.term_frequency(s), s_true)

def test_term_frequency_lowercase(self):
s = pd.Series(["one ONE"])
s_true = pd.Series([[2]])
self.assertEqual(representation.term_frequency(s, lowercase=True), s_true)

def test_term_frequency_punctuation_are_kept(self):
s = pd.Series(["one !"])
s_true = pd.Series([[1, 1]])
self.assertEqual(representation.term_frequency(s, lowercase=True), s_true)
self.assertEqual(representation.term_frequency(s), s_true)

"""
TF-IDF
Expand All @@ -62,8 +57,3 @@ def test_idf_single_not_lowercase(self):
s = pd.Series("ONE one")
s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]])
self.assertEqual(representation.tfidf(s), s_true)

def test_idf_single_lowercase(self):
s = pd.Series("ONE one")
s_true = pd.Series([[1.0]])
self.assertEqual(representation.tfidf(s, lowercase=True), s_true)
66 changes: 34 additions & 32 deletions texthero/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,39 @@ def named_entities(s, package="spacy"):
"""
Return named-entities.
Use Spacy named-entity-recognition.
PERSON: People, including fictional.
NORP: Nationalities or religious or political groups.
FAC: Buildings, airports, highways, bridges, etc.
ORG: Companies, agencies, institutions, etc.
GPE: Countries, cities, states.
LOC: Non-GPE locations, mountain ranges, bodies of water.
PRODUCT: Objects, vehicles, foods, etc. (Not services.)
EVENT: Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW: Named documents made into laws.
LANGUAGE: Any named language.
DATE: Absolute or relative dates or periods.
TIME: Times smaller than a day.
PERCENT: Percentage, including ”%“.
MONEY: Monetary values, including unit.
QUANTITY: Measurements, as of weight or distance.
ORDINAL: “first”, “second”, etc.
CARDINAL: Numerals that do not fall under another type.
Return a Pandas Series where each row contains a list of tuples containing information regarding the given named entities.
Tuple: (`entity name`, `entity label`, `starting character`, `ending character`)
Under the hood, `named_entities` makes use of spaCy named-entity recognition.
List of labels:
- `PERSON`: People, including fictional.
- `NORP`: Nationalities or religious or political groups.
- `FAC`: Buildings, airports, highways, bridges, etc.
- `ORG`: Companies, agencies, institutions, etc.
- `GPE`: Countries, cities, states.
- `LOC`: Non-GPE locations, mountain ranges, bodies of water.
- `PRODUCT`: Objects, vehicles, foods, etc. (Not services.)
- `EVENT`: Named hurricanes, battles, wars, sports events, etc.
- `WORK_OF_ART`: Titles of books, songs, etc.
- `LAW`: Named documents made into laws.
- `LANGUAGE`: Any named language.
- `DATE`: Absolute or relative dates or periods.
- `TIME`: Times smaller than a day.
- `PERCENT`: Percentage, including “%”.
- `MONEY`: Monetary values, including unit.
- `QUANTITY`: Measurements, as of weight or distance.
- `ORDINAL`: “first”, “second”, etc.
- `CARDINAL`: Numerals that do not fall under another type.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series("Yesterday I was in NY with Bill de Blasio")
>>> hero.named_entities(s)[0]
[('Yesterday', 'DATE', 0, 9), ('NY', 'GPE', 19, 21), ('Bill de Blasio', 'PERSON', 27, 41)]
"""
entities = []

Expand All @@ -47,11 +59,9 @@ def named_entities(s, package="spacy"):

def noun_chunks(s):
"""
Return noun_chunks, flat phrases that have a noun as their head.
Return group of consecutive words that belong together.
Return noun chunks: groups of consecutive words that belong together.
"""

noun_chunks = []

nlp = spacy.load("en_core_web_sm", disable=["ner"])
Expand All @@ -66,11 +76,3 @@ def noun_chunks(s):
)

return pd.Series(noun_chunks, index=s.index)


def dependency_parse(s):
"""
Return the dependency parse
"""
return NotImplemented
Loading

0 comments on commit 84fabd9

Please sign in to comment.