RepresentationSeries: pca, nmf, tsne #140

Closed
16 changes: 12 additions & 4 deletions tests/test_indexes.py
@@ -12,6 +12,11 @@
s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6])
s_numeric = pd.Series([5.0], index=[5])
s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])
s_representation_vectors = pd.Series(
    [1.0, 0.0, 0.0, 1.0],
    index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]),
)


# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
@@ -71,9 +76,9 @@
        lambda x: representation.flatten(representation.tfidf(x)),
        (s_tokenized_lists,),
    ],
-    ["pca", representation.pca, (s_numeric_lists, 0)],
-    ["nmf", representation.nmf, (s_numeric_lists,)],
-    ["tsne", representation.tsne, (s_numeric_lists,)],
+    ["pca", representation.pca, (s_representation_vectors,)],
+    ["nmf", representation.nmf, (s_representation_vectors,)],
+    ["tsne", representation.tsne, (s_representation_vectors,)],
    ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
    ["dbscan", representation.dbscan, (s_numeric_lists,)],
    ["meanshift", representation.meanshift, (s_numeric_lists,)],
@@ -107,7 +112,10 @@ def test_correct_index(self, name, test_function, valid_input):
        s = valid_input[0]
        result_s = test_function(*valid_input)
        t_same_index = pd.Series(s.values, s.index)
-        self.assertTrue(result_s.index.equals(t_same_index.index))
+        if isinstance(s.index, pd.MultiIndex):  # if Representation Series
+            self.assertTrue(result_s.index.equals(t_same_index.index.levels[0]))
+        else:
+            self.assertTrue(result_s.index.equals(t_same_index.index))

    @parameterized.expand(test_cases)
    def test_incorrect_index(self, name, test_function, valid_input):
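For context on the `levels[0]` comparison in `test_correct_index` above: a Representation Series keys each value by a `(document, term)` pair, so the document index the tested function should return is the first MultiIndex level. A minimal sketch using the same toy data as the fixture:

```python
import pandas as pd

# A Representation Series: one weight per (document, term) pair.
s = pd.Series(
    [1.0, 0.0, 0.0, 1.0],
    index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]),
)

# The first index level holds the document ids that the dimensionality
# reduction functions are expected to return.
print(s.index.levels[0].tolist())  # [5, 7]
```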
48 changes: 47 additions & 1 deletion tests/test_representation.py
@@ -60,6 +60,20 @@ def _tfidf(term, corpus, document_index):

s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)

s_representation_vectors_index = pd.MultiIndex.from_tuples(
    [(5, "A"), (5, "B"), (7, "A"), (7, "C")]
)

s_representation_vectors = pd.Series(
    [1.0, 0.0, 0.0, 1.0], index=s_representation_vectors_index
)

s_flat_vectors_index = pd.Index([5, 7])

s_flat_vectors = pd.Series(
    [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], index=s_flat_vectors_index
)


test_cases_vectorization = [
    # format: [function_name, function, correct output for tokenized input above, dtype of output]
@@ -85,6 +99,16 @@ def _tfidf(term, corpus, document_index):
["tfidf", representation.tfidf, [2.0, 1.0], "float",],
]

test_cases_dim_reduction = [
    # format: [function_name, function, correct output for the representation input above]
    [
        "pca",
        representation.pca,
        [[0.7071067811865475, 0.0], [-0.7071067811865475, 0.0]],
    ],
    ["nmf", representation.nmf, [[0.0, 1.0], [1.0, 0.0]]],
]
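The expected `pca` values above can be sanity-checked directly against scikit-learn on the dense form of `s_representation_vectors` (documents 5 and 7 over the vocabulary A, B, C). A quick sketch, with signs subject to scikit-learn's sign convention:

```python
import numpy as np
from sklearn.decomposition import PCA

# Dense form of s_representation_vectors:
# rows = documents 5 and 7, columns = terms A, B, C.
X = np.array([[1.0, 0.0, 0.0],
              [0.0, 0.0, 1.0]])

out = PCA(n_components=2, random_state=42).fit_transform(X)
print(out)  # ~ [[0.7071, 0.0], [-0.7071, 0.0]], i.e. +-1/sqrt(2) on the first component
```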


class AbstractRepresentationTest(PandasTestCase):
"""
@@ -163,7 +187,29 @@ def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args):
    @parameterized.expand(test_cases_vectorization)
    def test_vectorization_arguments_to_sklearn(self, name, test_function, *args):
        try:
-            test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0)
+            test_function(s_tokenized, max_features=1, min_df=1, max_df=1.0)
        except TypeError:
            self.fail("Sklearn arguments not handled correctly.")

"""
Dimensionality Reduction
"""

@parameterized.expand(test_cases_dim_reduction)
def test_dim_reduction_simple_with_index(
self, name, test_function, correct_output_values
):
s_true = pd.Series(correct_output_values, index=s_flat_vectors_index)

result_s = test_function(s_representation_vectors, random_state=42)

# check_less_precise True to prevent rounding errors from giving a Failure.
pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)

@parameterized.expand(test_cases_dim_reduction)
def test_dim_reduction_arguments_to_sklearn(self, name, test_function, *args):
try:
test_function(s_representation_vectors, n_components=2, random_state=42)
except TypeError:
self.fail("Sklearn arguments not handled correctly.")

104 changes: 100 additions & 4 deletions texthero/representation.py
@@ -430,8 +430,17 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
    are easily visible. The corpus can now be visualized in 3D and we can
    get a good first view of the data!

    Be careful: PCA can *not* handle sparse input, so even when calling PCA
    on a very sparse Representation Series, texthero will internally compute
    the whole dense representation. If you're working with big datasets, you
    should probably use :meth:`texthero.representation.nmf` or
    :meth:`texthero.representation.tsne` instead, as they can handle sparse input.

    In general, *pca* should be called after the text has already been represented in matrix form.

    The input has to be a Representation Series.
    TODO add typing module link

    Parameters
    ----------
    s : Pandas Series
@@ -466,9 +475,40 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
    --------
    `PCA on Wikipedia <https://en.wikipedia.org/wiki/Principal_component_analysis>`_

    Representation Series: TODO add tutorial link and typing module link

    """
    pca = PCA(n_components=n_components, random_state=random_state, copy=False)
-    return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index)

    if _check_is_valid_representation(s):

        if pd.api.types.is_sparse(s):
            s_coo_matrix = s.sparse.to_coo()[0]
            if s_coo_matrix.shape[1] > 1000:
                warnings.warn(
                    "Be careful! You are computing PCA on a sparse Pandas Series with a very large vocabulary."
                    " PCA normalizes the data, which requires expanding the sparse input to a dense matrix."
                    " This operation might take a long time. Consider using `svd_truncated` instead, as it handles sparse matrices efficiently."
                )
        else:
            # Treat it as a sparse matrix anyway for efficiency.
            s = s.astype("Sparse")
            s_coo_matrix = s.sparse.to_coo()[0]

        # PCA cannot handle sparse input: convert to a dense ndarray
        # (.toarray() rather than .todense(), as scikit-learn rejects np.matrix).
        s_for_vectorization = s_coo_matrix.toarray()

    # Else: not a Representation Series -> fail
    else:
        raise ValueError(
            f"The input Pandas Series should be a Representation Series: it should have a MultiIndex where the first level represents the documents and the second level the words/tokens. The given Pandas Series has {s.index.nlevels} index level(s) instead of 2."
        )

    s_out = pd.Series(
        pca.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)

    return s_out
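`_check_is_valid_representation` is defined elsewhere in `representation.py` and is not part of this diff; judging from the error message, it presumably verifies the two-level MultiIndex. A hypothetical reconstruction (the real helper may differ):

```python
import pandas as pd

def _check_is_valid_representation(s: pd.Series) -> bool:
    # Hypothetical sketch: a Representation Series is keyed by a two-level
    # MultiIndex (document, term); anything else is rejected by the caller.
    return isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2
```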

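The `s.sparse.to_coo()[0]` idiom relies on pandas assembling a scipy COO matrix from the two MultiIndex levels (documents as rows, terms as columns). A minimal sketch with the toy fixture data:

```python
import pandas as pd

s = pd.Series(
    [1.0, 0.0, 0.0, 1.0],
    index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]),
).astype("Sparse")

# to_coo() returns (coo_matrix, row_labels, column_labels); the functions
# above keep only the matrix itself via [0].
coo, rows, cols = s.sparse.to_coo()
print(coo.shape, rows, cols)  # (2, 3) [5, 7] ['A', 'B', 'C']
```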

def nmf(s, n_components=2, random_state=None) -> pd.Series:
@@ -488,6 +528,8 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
    and calculate a vector for each document that places it
    correctly among the topics.

    The input has to be a Representation Series.
    TODO add tutorial link

    Parameters
    ----------
@@ -525,9 +567,35 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
    --------
    `NMF on Wikipedia <https://en.wikipedia.org/wiki/Non-negative_matrix_factorization>`_

    Representation Series: TODO add tutorial link and typing module link
    """
nmf = NMF(n_components=n_components, init="random", random_state=random_state,)
return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index)

nmf = NMF(n_components=n_components, init=None, random_state=random_state)

if _check_is_valid_representation(s):

if pd.api.types.is_sparse(s):
s_coo_matrix = s.sparse.to_coo()[0]
else:
# Treat it as a Sparse matrix anyway for efficiency.
s = s.astype("Sparse")
s_coo_matrix = s.sparse.to_coo()[0]

s_for_vectorization = s_coo_matrix # NMF can work with sparse input.

# Else: no Representation Series -> fail
else:
raise ValueError(
f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2."
)

s_out = pd.Series(
nmf.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
)

s_out = s_out.rename_axis(None)

return s_out

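Unlike PCA, scikit-learn's NMF consumes a scipy sparse matrix directly, which is why no densification happens in the branch above. A small sketch with the same toy matrix (component ordering may vary by scikit-learn version):

```python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

X = csr_matrix(np.array([[1.0, 0.0, 0.0],
                         [0.0, 0.0, 1.0]]))

# init=None lets scikit-learn pick its default initialization.
embedding = NMF(n_components=2, init=None, random_state=42).fit_transform(X)
print(np.round(embedding, 3))  # expected close to [[0, 1], [1, 0]] per the tests
```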

@@ -554,6 +622,8 @@ def tsne(
    vector in such a way that the differences / similarities between
    documents are preserved.

    The input has to be a Representation Series.
    TODO add typing module link

    Parameters
    ----------
@@ -610,6 +680,8 @@ def tsne(
    --------
    `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_

    Representation Series: TODO add tutorial link and typing module link

    """
    tsne = TSNE(
        n_components=n_components,
@@ -619,7 +691,31 @@
        random_state=random_state,
        n_jobs=n_jobs,
    )
-    return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index)

    if _check_is_valid_representation(s):

        if pd.api.types.is_sparse(s):
            s_coo_matrix = s.sparse.to_coo()[0]
        else:
            # Treat it as a sparse matrix anyway for efficiency.
            s = s.astype("Sparse")
            s_coo_matrix = s.sparse.to_coo()[0]

        # t-SNE can work with sparse input (scikit-learn supports it with
        # method="exact"; the default "barnes_hut" may require dense data).
        s_for_vectorization = s_coo_matrix

    # Else: not a Representation Series -> fail
    else:
        raise ValueError(
            f"The input Pandas Series should be a Representation Series: it should have a MultiIndex where the first level represents the documents and the second level the words/tokens. The given Pandas Series has {s.index.nlevels} index level(s) instead of 2."
        )

    s_out = pd.Series(
        tsne.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0)
    )

    s_out = s_out.rename_axis(None)

    return s_out


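Taken together, the call pattern this PR enables is: `tfidf` produces a Representation Series, and `pca`/`nmf`/`tsne` consume it, returning one low-dimensional vector per document. A hedged usage sketch, assuming the texthero API on this branch:

```python
import pandas as pd
import texthero as hero

s = pd.Series(["Football is a sport", "Music is an art"])
s = hero.clean(s).pipe(hero.tokenize)

s_tfidf = hero.tfidf(s)         # Representation Series: MultiIndex (document, term)
s_pca = s_tfidf.pipe(hero.pca)  # flat Series: one 2-component vector per document
print(s_pca)
```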
"""
Expand Down
2 changes: 1 addition & 1 deletion texthero/visualization.py
@@ -62,7 +62,7 @@ def scatterplot(
    >>> import pandas as pd
    >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"])
    >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize)
-    >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten
+    >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3)
    >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten
    >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP
    """