RepresentationSeries: pca, nmf, tsne #140

Closed
16 changes: 12 additions & 4 deletions tests/test_indexes.py
@@ -12,6 +12,11 @@
s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6])
s_numeric = pd.Series([5.0], index=[5])
s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])
s_representation_vectors = pd.Series(
    [1.0, 0.0, 0.0, 1.0],
    index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]),
)


# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
@@ -71,9 +76,9 @@
        lambda x: representation.flatten(representation.tfidf(x)),
        (s_tokenized_lists,),
    ],
-    ["pca", representation.pca, (s_numeric_lists, 0)],
-    ["nmf", representation.nmf, (s_numeric_lists,)],
-    ["tsne", representation.tsne, (s_numeric_lists,)],
+    ["pca", representation.pca, (s_representation_vectors,)],
+    ["nmf", representation.nmf, (s_representation_vectors,)],
+    ["tsne", representation.tsne, (s_representation_vectors,)],
    ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
    ["dbscan", representation.dbscan, (s_numeric_lists,)],
    ["meanshift", representation.meanshift, (s_numeric_lists,)],
@@ -107,7 +112,10 @@ def test_correct_index(self, name, test_function, valid_input):
        s = valid_input[0]
        result_s = test_function(*valid_input)
        t_same_index = pd.Series(s.values, s.index)
-        self.assertTrue(result_s.index.equals(t_same_index.index))
+        if isinstance(s.index, pd.MultiIndex):  # if Representation Series
+            self.assertTrue(result_s.index.equals(t_same_index.index.levels[0]))
+        else:
+            self.assertTrue(result_s.index.equals(t_same_index.index))

    @parameterized.expand(test_cases)
    def test_incorrect_index(self, name, test_function, valid_input):
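For context on the `levels[0]` comparison in `test_correct_index` above: a Representation Series keys each value by a `(document, term)` pair, so the document index the tested function should return is the first MultiIndex level. A minimal sketch using the same toy data as the fixture:

```python
import pandas as pd

# A Representation Series: one weight per (document, term) pair.
s = pd.Series(
    [1.0, 0.0, 0.0, 1.0],
    index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]),
)

# The first index level holds the document ids that the dimensionality
# reduction functions are expected to return.
print(s.index.levels[0].tolist())  # [5, 7]
```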
48 changes: 47 additions & 1 deletion tests/test_representation.py
@@ -60,6 +60,20 @@ def _tfidf(term, corpus, document_index):

s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)

s_representation_vectors_index = pd.MultiIndex.from_tuples(
    [(5, "A"), (5, "B"), (7, "A"), (7, "C")]
)

s_representation_vectors = pd.Series(
    [1.0, 0.0, 0.0, 1.0], index=s_representation_vectors_index
)

s_flat_vectors_index = pd.Index([5, 7])

s_flat_vectors = pd.Series(
    [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], index=s_flat_vectors_index
)


test_cases_vectorization = [
    # format: [function_name, function, correct output for tokenized input above, dtype of output]
@@ -85,6 +99,16 @@ def _tfidf(term, corpus, document_index):
["tfidf", representation.tfidf, [2.0, 1.0], "float",],
]

test_cases_dim_reduction = [
    # format: [function_name, function, correct output for the representation input above]
    [
        "pca",
        representation.pca,
        [[0.7071067811865475, 0.0], [-0.7071067811865475, 0.0]],
    ],
    ["nmf", representation.nmf, [[0.0, 1.0], [1.0, 0.0]]],
]
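The expected `pca` values above can be sanity-checked directly against scikit-learn on the dense form of `s_representation_vectors` (documents 5 and 7 over the vocabulary A, B, C). A quick sketch, with signs subject to scikit-learn's sign convention:

```python
import numpy as np
from sklearn.decomposition import PCA

# Dense form of s_representation_vectors:
# rows = documents 5 and 7, columns = terms A, B, C.
X = np.array([[1.0, 0.0, 0.0],
              [0.0, 0.0, 1.0]])

out = PCA(n_components=2, random_state=42).fit_transform(X)
print(out)  # ~ [[0.7071, 0.0], [-0.7071, 0.0]], i.e. +-1/sqrt(2) on the first component
```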


class AbstractRepresentationTest(PandasTestCase):
"""
@@ -163,7 +187,29 @@ def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args):
    @parameterized.expand(test_cases_vectorization)
    def test_vectorization_arguments_to_sklearn(self, name, test_function, *args):
        try:
-            test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0)
+            test_function(s_tokenized, max_features=1, min_df=1, max_df=1.0)
        except TypeError:
            self.fail("Sklearn arguments not handled correctly.")

"""
Dimensionality Reduction
"""

@parameterized.expand(test_cases_dim_reduction)
def test_dim_reduction_simple_with_index(
self, name, test_function, correct_output_values
):
s_true = pd.Series(correct_output_values, index=s_flat_vectors_index)

result_s = test_function(s_representation_vectors, random_state=42)

# check_less_precise True to prevent rounding errors from giving a Failure.
pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)

@parameterized.expand(test_cases_dim_reduction)
def test_dim_reduction_arguments_to_sklearn(self, name, test_function, *args):
try:
test_function(s_representation_vectors, n_components=2, random_state=42)
except TypeError:
self.fail("Sklearn arguments not handled correctly.")

104 changes: 100 additions & 4 deletions texthero/representation.py
@@ -430,8 +430,17 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
    are easily visible. The corpus can now be visualized in 3D and we can
    get a good first view of the data!

    Be careful: PCA can *not* handle sparse input, so even when calling PCA
    on a very sparse Representation Series, texthero will internally compute
    the whole dense representation. If you're working with big datasets, you
    should probably use :meth:`texthero.representation.nmf` or
    :meth:`texthero.representation.tsne` instead, as they can handle sparse input.

    In general, *pca* should be called after the text has already been represented in matrix form.

    The input has to be a Representation Series.
    TODO add typing module link

    Parameters
    ----------
    s : Pandas Series
@@ -466,9 +475,40 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
    --------
    `PCA on Wikipedia <https://en.wikipedia.org/wiki/Principal_component_analysis>`_

    Representation Series: TODO add tutorial link and typing module link

    """
    pca = PCA(n_components=n_components, random_state=random_state, copy=False)
-    return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index)

    if _check_is_valid_representation(s):

        if pd.api.types.is_sparse(s):
            s_coo_matrix = s.sparse.to_coo()[0]
            if s_coo_matrix.shape[1] > 1000:
                warnings.warn(
                    "Be careful! You are computing PCA on a sparse Pandas Series with a very large vocabulary."
                    " PCA normalizes the data, which requires expanding the sparse input to a dense matrix."
                    " This operation might take a long time. Consider using `svd_truncated` instead, as it handles sparse matrices efficiently."
                )
        else:
            # Treat it as a sparse matrix anyway for efficiency.
            s = s.astype("Sparse")
            s_coo_matrix = s.sparse.to_coo()[0]

        # PCA cannot handle sparse input: convert to a dense ndarray
        # (.toarray() rather than .todense(), as scikit-learn rejects np.matrix).
        s_for_vectorization = s_coo_matrix.toarray()

    # Else: not a Representation Series -> fail
    else:
        raise ValueError(
            f"The input Pandas Series should be a Representation Series: it should have a MultiIndex where the first level represents the documents and the second level the words/tokens. The given Pandas Series has {s.index.nlevels} index level(s) instead of 2."
        )

    s_out = pd.Series(
        pca.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
    )
    s_out = s_out.rename_axis(None)

    return s_out
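`_check_is_valid_representation` is defined elsewhere in `representation.py` and is not part of this diff; judging from the error message, it presumably verifies the two-level MultiIndex. A hypothetical reconstruction (the real helper may differ):

```python
import pandas as pd

def _check_is_valid_representation(s: pd.Series) -> bool:
    # Hypothetical sketch: a Representation Series is keyed by a two-level
    # MultiIndex (document, term); anything else is rejected by the caller.
    return isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2
```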

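The `s.sparse.to_coo()[0]` idiom relies on pandas assembling a scipy COO matrix from the two MultiIndex levels (documents as rows, terms as columns). A minimal sketch with the toy fixture data:

```python
import pandas as pd

s = pd.Series(
    [1.0, 0.0, 0.0, 1.0],
    index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]),
).astype("Sparse")

# to_coo() returns (coo_matrix, row_labels, column_labels); the functions
# above keep only the matrix itself via [0].
coo, rows, cols = s.sparse.to_coo()
print(coo.shape, rows, cols)  # (2, 3) [5, 7] ['A', 'B', 'C']
```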

def nmf(s, n_components=2, random_state=None) -> pd.Series:
@@ -488,6 +528,8 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
    and calculate a vector for each document that places it
    correctly among the topics.

    The input has to be a Representation Series.
    TODO add tutorial link

    Parameters
    ----------
@@ -525,9 +567,35 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
    --------
    `NMF on Wikipedia <https://en.wikipedia.org/wiki/Non-negative_matrix_factorization>`_

    Representation Series: TODO add tutorial link and typing module link
    """
nmf = NMF(n_components=n_components, init="random", random_state=random_state,)
return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index)

nmf = NMF(n_components=n_components, init=None, random_state=random_state)

if _check_is_valid_representation(s):

if pd.api.types.is_sparse(s):
s_coo_matrix = s.sparse.to_coo()[0]
else:
# Treat it as a Sparse matrix anyway for efficiency.
s = s.astype("Sparse")
s_coo_matrix = s.sparse.to_coo()[0]

s_for_vectorization = s_coo_matrix # NMF can work with sparse input.

# Else: no Representation Series -> fail
else:
raise ValueError(
f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2."
)

s_out = pd.Series(
nmf.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
)

s_out = s_out.rename_axis(None)

return s_out

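Unlike PCA, scikit-learn's NMF consumes a scipy sparse matrix directly, which is why no densification happens in the branch above. A small sketch with the same toy matrix (component ordering may vary by scikit-learn version):

```python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

X = csr_matrix(np.array([[1.0, 0.0, 0.0],
                         [0.0, 0.0, 1.0]]))

# init=None lets scikit-learn pick its default initialization.
embedding = NMF(n_components=2, init=None, random_state=42).fit_transform(X)
print(np.round(embedding, 3))  # expected close to [[0, 1], [1, 0]] per the tests
```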

@@ -554,6 +622,8 @@ def tsne(
    vector in such a way that the differences / similarities between
    documents are preserved.

    The input has to be a Representation Series.
    TODO add typing module link

    Parameters
    ----------
@@ -610,6 +680,8 @@ def tsne(
    --------
    `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_

    Representation Series: TODO add tutorial link and typing module link

    """
    tsne = TSNE(
        n_components=n_components,
@@ -619,7 +691,31 @@
        random_state=random_state,
        n_jobs=n_jobs,
    )
-    return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index)

    if _check_is_valid_representation(s):

        if pd.api.types.is_sparse(s):
            s_coo_matrix = s.sparse.to_coo()[0]
        else:
            # Treat it as a sparse matrix anyway for efficiency.
            s = s.astype("Sparse")
            s_coo_matrix = s.sparse.to_coo()[0]

        # t-SNE can work with sparse input (scikit-learn supports it with
        # method="exact"; the default "barnes_hut" may require dense data).
        s_for_vectorization = s_coo_matrix

    # Else: not a Representation Series -> fail
    else:
        raise ValueError(
            f"The input Pandas Series should be a Representation Series: it should have a MultiIndex where the first level represents the documents and the second level the words/tokens. The given Pandas Series has {s.index.nlevels} index level(s) instead of 2."
        )

    s_out = pd.Series(
        tsne.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0)
    )

    s_out = s_out.rename_axis(None)

    return s_out


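Taken together, the call pattern this PR enables is: `tfidf` produces a Representation Series, and `pca`/`nmf`/`tsne` consume it, returning one low-dimensional vector per document. A hedged usage sketch, assuming the texthero API on this branch:

```python
import pandas as pd
import texthero as hero

s = pd.Series(["Football is a sport", "Music is an art"])
s = hero.clean(s).pipe(hero.tokenize)

s_tfidf = hero.tfidf(s)         # Representation Series: MultiIndex (document, term)
s_pca = s_tfidf.pipe(hero.pca)  # flat Series: one 2-component vector per document
print(s_pca)
```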
"""
Expand Down
2 changes: 1 addition & 1 deletion texthero/visualization.py
@@ -62,7 +62,7 @@ def scatterplot(
    >>> import pandas as pd
    >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"])
    >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize)
-    >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten
+    >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3)
    >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten
    >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP
    """