From fa342a92d4f007cebfce29f1f22a4e31fedc56c6 Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Tue, 18 Aug 2020 22:06:14 +0200
Subject: [PATCH 01/18] added MultiIndex DF support

support MultiIndex as function parameter
returns MultiIndex, where Representation was returned

* missing: correct test

Co-authored-by: Henri Froese
---
 tests/test_indexes.py        |  18 +--
 tests/test_representation.py |  63 +-------
 texthero/representation.py   | 294 +++++++++++++----------------------
 texthero/visualization.py    |   4 +-
 4 files changed, 115 insertions(+), 264 deletions(-)

diff --git a/tests/test_indexes.py b/tests/test_indexes.py
index cc041c3a..af7afcd2 100644
--- a/tests/test_indexes.py
+++ b/tests/test_indexes.py
@@ -56,21 +56,9 @@
 ]

 test_cases_representation = [
-    [
-        "count",
-        lambda x: representation.flatten(representation.count(x)),
-        (s_tokenized_lists,),
-    ],
-    [
-        "term_frequency",
-        lambda x: representation.flatten(representation.term_frequency(x)),
-        (s_tokenized_lists,),
-    ],
-    [
-        "tfidf",
-        lambda x: representation.flatten(representation.tfidf(x)),
-        (s_tokenized_lists,),
-    ],
+    ["count", representation.count, (s_tokenized_lists,),],
+    ["term_frequency", representation.term_frequency, (s_tokenized_lists,),],
+    ["tfidf", representation.tfidf, (s_tokenized_lists,),],
     ["pca", representation.pca, (s_numeric_lists, 0)],
     ["nmf", representation.nmf, (s_numeric_lists,)],
     ["tsne", representation.tsne, (s_numeric_lists,)],
diff --git a/tests/test_representation.py b/tests/test_representation.py
index 036775af..41b81ffa 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -50,16 +50,9 @@ def _tfidf(term, corpus, document_index):
     [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7]
 )

-s_tokenized_output_index = pd.MultiIndex.from_tuples(
-    [(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")],
-)
-
-s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples(
-    [(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")],
-)
-
-s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)
+s_tokenized_output_index = [0,1]

+s_tokenized_output_index_noncontinous = [5,7]

 test_cases_vectorization = [
     # format: [function_name, function, correct output for tokenized input above, dtype of output]
@@ -182,55 +175,3 @@ def test_tfidf_formula(self):
         ).astype("Sparse")

         self.assertEqual(representation.tfidf(s), s_true)
-
-    """
-    flatten.
-    """
-
-    def test_flatten(self):
-        index = pd.MultiIndex.from_tuples(
-            [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
-        )
-        s = pd.Series([3, np.nan, 4], index=index)
-
-        s_true = pd.Series(
-            [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"],
-        )
-
-        pd.testing.assert_series_equal(
-            representation.flatten(s), s_true, check_names=False
-        )
-
-    def test_flatten_fill_missing_with(self):
-        index = pd.MultiIndex.from_tuples(
-            [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
-        )
-        s = pd.Series([3, np.nan, 4], index=index)
-
-        s_true = pd.Series(
-            [[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]],
-            index=["doc0", "doc1"],
-        )
-
-        pd.testing.assert_series_equal(
-            representation.flatten(s, fill_missing_with="FILLED"),
-            s_true,
-            check_names=False,
-        )
-
-    def test_flatten_missing_row(self):
-        # Simulating a row with no features, so it's completely missing from
-        # the representation series.
- index = pd.MultiIndex.from_tuples( - [("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], - ) - s = pd.Series([3, np.nan, 4], index=index) - - s_true = pd.Series( - [[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]], - index=["doc0", "doc1", "doc2"], - ) - - pd.testing.assert_series_equal( - representation.flatten(s, index=s_true.index), s_true, check_names=False - ) diff --git a/texthero/representation.py b/texthero/representation.py index 07b7706c..042db71a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -27,90 +27,14 @@ """ -def flatten( - s: Union[pd.Series, pd.Series.sparse], - index: pd.Index = None, - fill_missing_with: Any = 0.0, -) -> pd.Series: - """ - Transform a Pandas Representation Series to a "normal" (flattened) Pandas Series. - - The given Series should have a multiindex with first level being the document - and second level being individual features of that document (e.g. tdidf scores per word). - The flattened Series has one cell per document, with the cell being a list of all - the individual features of that document. - - Parameters - ---------- - s : Sparse Pandas Series or Pandas Series - The multiindexed Pandas Series to flatten. - - index : Pandas Index, optional, default to None - The index the flattened Series should have. - - fill_missing_with : Any, default to 0.0 - Value to fill the NaNs (missing values) with. This _does not_ mean - that existing values that are np.nan are replaced, but rather that - features that are not present in one document but present in others - are filled with fill_missing_with. See example below. - - - Examples - -------- - >>> import texthero as hero - >>> import pandas as pd - >>> import numpy as np - >>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word']) - >>> s = pd.Series([3, np.nan, 4], index=index) - >>> s - document word - doc0 Word1 3.0 - Word3 NaN - doc1 Word2 4.0 - dtype: float64 - >>> hero.flatten(s, fill_missing_with=0.0) - document - doc0 [3.0, 0.0, nan] - doc1 [0.0, 4.0, 0.0] - dtype: object - - """ - s = s.unstack(fill_value=fill_missing_with) - - if index is not None: - s = s.reindex(index, fill_value=fill_missing_with) - # Reindexing makes the documents for which no values - # are present in the Sparse Representation Series - # "reappear" correctly. - - s = pd.Series(s.values.tolist(), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: +def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: """ - Check if the given Pandas Series is a Document Representation Series. + Check if the given Pandas Series is a Document Term DF. - Returns true if Series is Document Representation Series, else False. + Returns true if input is Document Term DF, else False. """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." 
- # ) - - return True + return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) # Warning message for not-tokenized inputs @@ -132,11 +56,11 @@ def count( min_df=1, max_df=1.0, binary=False, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using count. - Return a Document Representation Series with the + Return a Document Term DataFrame with the number of occurences of a document's words for every document. TODO add tutorial link @@ -144,10 +68,6 @@ def count( The input Series should already be tokenized. If not, it will be tokenized before count is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -177,15 +97,14 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - 0 Sentence 1 - one 1 - 1 Sentence 1 - two 1 - dtype: Sparse[int64, 0] + count + Sentence one two + 0 1 1 0 + 1 1 0 1 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -204,25 +123,23 @@ def count( ) tf_vectors_csr = tf.fit_transform(s) - tf_vectors_coo = coo_matrix(tf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tf_vectors_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("count", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tf_vectors_csr, s.index, multiindexed_columns + ) def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, -) -> pd.Series: +) -> pd.DataFrame: """ Represent a text-based Pandas Series using term frequency. - Return a Document Representation Series with the + Return a Document Term DataFrame with the term frequencies of the terms for every document. TODO add tutorial link @@ -230,11 +147,6 @@ def term_frequency( The input Series should already be tokenized. If not, it will be tokenized before term_frequency is calculated. - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - - Parameters ---------- s : Pandas Series (tokenized) @@ -261,16 +173,14 @@ def term_frequency( >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) >>> hero.term_frequency(s) - 0 Sentence 0.2 - hey 0.2 - one 0.2 - 1 Sentence 0.2 - two 0.2 - dtype: Sparse[float64, nan] + term_frequency + Sentence hey one two + 0 0.2 0.2 0.2 0.0 + 1 0.2 0.0 0.0 0.2 See Also -------- - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. 
if not isinstance(s.iloc[0], list): @@ -291,17 +201,16 @@ def term_frequency( total_count_coo = np.sum(tf_vectors_coo) frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - s_out = pd.Series.sparse.from_coo(frequency_coo) - - features_names = tf.get_feature_names() - - # Map word index to word name - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("term_frequency", word) for word in tf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + frequency_coo, s.index, multiindexed_columns + ) -def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: +def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame: """ Represent a text-based Pandas Series using TF-IDF. @@ -324,20 +233,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: so the result is exactly what you get applying the formula described above. - Return a Document Representation Series with the + Return a Document Term DataFrame with the tfidf of every word in the document. TODO add tutorial link The input Series should already be tokenized. If not, it will be tokenized before tfidf is calculated. - If working with big pandas Series, you might want to limit - the number of features through the max_features parameter. - - Use :meth:`hero.representation.flatten` on the output to get - a standard Pandas Series with the document vectors - in every cell. - Parameters ---------- s : Pandas Series (tokenized) @@ -365,17 +267,16 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) >>> hero.tfidf(s) - 0 Bye 1.000000 - Hi 1.405465 - 1 Bye 2.000000 - Test 1.405465 - dtype: Sparse[float64, nan] + tfidf + Bye Hi Test + 0 1.0 1.405465 0.000000 + 1 2.0 0.000000 1.405465 See Also -------- `TF-IDF on Wikipedia `_ - Document Representation Series: TODO add tutorial link + Document Term DataFrame: TODO add tutorial link """ # Check if input is tokenized. Else, print warning and tokenize. @@ -395,16 +296,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: tfidf_vectors_csr = tfidf.fit_transform(s) - # Result from sklearn is in Compressed Sparse Row format. - # Pandas Sparse Series can only be initialized from Coordinate format. - tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr) - s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo) - - # Map word index to word name and keep original index of documents. - feature_names = tfidf.get_feature_names() - s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]])) + multiindexed_columns = pd.MultiIndex.from_tuples( + [("tfidf", word) for word in tfidf.get_feature_names()] + ) - return s_out + return pd.DataFrame.sparse.from_spmatrix( + tfidf_vectors_csr, s.index, multiindexed_columns + ) """ @@ -412,7 +310,9 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series: """ -def pca(s, n_components=2, random_state=None) -> pd.Series: +def pca( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Perform principal component analysis on the given Pandas Series. @@ -434,7 +334,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or MuliIndex Sparse DataFrame n_components : Int. Default is 2. 
Number of components to keep (dimensionality of output vectors). @@ -468,10 +368,18 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.values + else: + values = list(s) + + return pd.Series(pca.fit_transform(values).tolist(), index=s.index) -def nmf(s, n_components=2, random_state=None) -> pd.Series: +def nmf( + s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None +) -> pd.Series: """ Performs non-negative matrix factorization. @@ -491,7 +399,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -527,11 +435,17 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: """ nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) def tsne( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_components=2, perplexity=30.0, learning_rate=200.0, @@ -557,7 +471,7 @@ def tsne( Parameters ---------- - s : Pandas Series + s : Pandas Series or Pandas MultiIndex Sparse DataFrame n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -619,7 +533,13 @@ def tsne( random_state=random_state, n_jobs=n_jobs, ) - return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_DocumentTermDF(s): + values = s.sparse.to_coo() + else: + values = list(s) + + return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) """ @@ -628,7 +548,7 @@ def tsne( def kmeans( - s: pd.Series, + s: Union[pd.Series, pd.DataFrame], n_clusters=5, n_init=10, max_iter=300, @@ -653,7 +573,7 @@ def kmeans( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame n_clusters: Int, default to 5. The number of clusters to separate the data into. 
@@ -686,7 +606,7 @@ def kmeans( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.kmeans(s, n_clusters=2, random_state=42) 0 1 1 0 @@ -702,7 +622,12 @@ def kmeans( `kmeans on Wikipedia `_ """ - vectors = list(s) + + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + kmeans = KMeans( n_clusters=n_clusters, n_init=n_init, @@ -715,7 +640,7 @@ def kmeans( def dbscan( - s, + s: Union[pd.Series, pd.DataFrame], eps=0.5, min_samples=5, metric="euclidean", @@ -743,7 +668,7 @@ def dbscan( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -783,7 +708,7 @@ def dbscan( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, enjoy, guitar"]) - >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten + >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> hero.dbscan(s, min_samples=1, eps=4) 0 0 1 1 @@ -801,6 +726,11 @@ def dbscan( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.sparse.to_coo() + else: + vectors = list(s) + return pd.Series( DBSCAN( eps=eps, @@ -809,13 +739,13 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") def meanshift( - s, + s: Union[pd.Series, pd.DataFrame], bandwidth=None, bin_seeding=False, min_bin_freq=1, @@ -843,7 +773,7 @@ def meanshift( Parameters ---------- - s: Pandas Series + s: Pandas Series or Pandas MultiIndex Sparse DataFrame bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -901,6 +831,11 @@ def meanshift( """ + if _check_is_valid_DocumentTermDF(s): + vectors = s.values + else: + vectors = list(s) + return pd.Series( MeanShift( bandwidth=bandwidth, @@ -909,7 +844,7 @@ def meanshift( cluster_all=cluster_all, n_jobs=n_jobs, max_iter=max_iter, - ).fit_predict(list(s)), + ).fit_predict(vectors), index=s.index, ).astype("category") @@ -962,31 +897,18 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: `Norm on Wikipedia `_ """ + isDocumentTermDF = _check_is_valid_DocumentTermDF(s) - is_valid_representation = ( - isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2 - ) - - if not is_valid_representation: - raise TypeError( - "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - ) - # TODO after merging representation: use _check_is_valid_representation instead - - if pd.api.types.is_sparse(s): - s_coo_matrix = s.sparse.to_coo()[0] + if isDocumentTermDF: + s_for_vectorization = s.sparse.to_coo() else: - s = s.astype("Sparse") - s_coo_matrix = s.sparse.to_coo()[0] - - s_for_vectorization = s_coo_matrix + s_for_vectorization = list(s) result = sklearn_normalize( s_for_vectorization, norm=norm ) # Can handle sparse input. 
- result_coo = coo_matrix(result) - s_result = pd.Series.sparse.from_coo(result_coo) - s_result.index = s.index - - return s_result + if isDocumentTermDF: + return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) + else: + return pd.Series(result.tolist(), index=s.index) diff --git a/texthero/visualization.py b/texthero/visualization.py index e213285e..2426ab4d 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -63,8 +63,8 @@ def scatterplot( >>> import pandas as pd >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) - >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten - >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten + >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3) + >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.kmeans, n_clusters=2) >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP """ From 59a9f8c0df70d8136780b3160bc1d2ca59f48b26 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Wed, 19 Aug 2020 19:39:30 +0200 Subject: [PATCH 02/18] beginning with tests --- tests/test_representation.py | 147 +++++++++++++++++------------------ texthero/representation.py | 8 +- 2 files changed, 76 insertions(+), 79 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 41b81ffa..d4acd369 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,32 +50,84 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0,1] +s_tokenized_output_index = [0, 1] + +s_tokenized_output_index_noncontinous = [5, 7] + + +def _get_multiindex_for_tokenized_output(first_level_name): + return pd.MultiIndex.from_product( + [[first_level_name], ["!", ".", "?", "TEST", "Test"]] + ) -s_tokenized_output_index_noncontinous = [5,7] test_cases_vectorization = [ - # format: [function_name, function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [1, 1, 2, 2, 1, 1], "int"], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("count"), + ).astype("Sparse"), + ], [ "term_frequency", representation.term_frequency, - [0.125, 0.125, 0.250, 0.250, 0.125, 0.125], - "float", + pd.DataFrame( + [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("term_frequency"), + ).astype("Sparse"), ], [ "tfidf", representation.tfidf, - [_tfidf(x[1], s_tokenized, x[0]) for x in s_tokenized_output_index], - "float", + pd.DataFrame( + [ + [ + _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here + for x in ["!", ".", "?", "TEST", "Test"] + ], + [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + ], + index=s_tokenized_output_index, + columns=_get_multiindex_for_tokenized_output("tfidf"), + ).astype("Sparse"), ], ] + test_cases_vectorization_min_df = [ - # format: [function_name, 
function, correct output for tokenized input above, dtype of output] - ["count", representation.count, [2, 1], "int"], - ["term_frequency", representation.term_frequency, [0.666667, 0.333333], "float",], - ["tfidf", representation.tfidf, [2.0, 1.0], "float",], + # format: [function_name, function, correct output for tokenized input above] + [ + "count", + representation.count, + pd.DataFrame( + [2, 1], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("count", "Test")]), + ).astype("Sparse"), + ], + [ + "term_frequency", + representation.term_frequency, + pd.DataFrame( + [0.666667, 0.333333], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), + ).astype("Sparse"), + ], + [ + "tfidf", + representation.tfidf, + pd.DataFrame( + [2.0, 1.0], + index=s_tokenized_output_index, + columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), + ).astype("Sparse"), + ], ] @@ -91,62 +143,23 @@ class AbstractRepresentationTest(PandasTestCase): """ @parameterized.expand(test_cases_vectorization) - def test_vectorization_simple( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="int" - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, index=s_tokenized_output_index, dtype="float" - ).astype(pd.SparseDtype("float", np.nan)) + def test_vectorization_simple(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( - self, name, test_function, correct_output_values, int_or_float + self, name, test_function, correct_output=None ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_noncontinuous_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - result_s = test_function(s_tokenized_with_noncontinuous_index) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) @parameterized.expand(test_cases_vectorization_min_df) - def test_vectorization_min_df( - self, name, test_function, correct_output_values, int_or_float - ): - if int_or_float == "int": - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="int", - ).astype(pd.SparseDtype(np.int64, 0)) - else: - s_true = pd.Series( - correct_output_values, - index=s_tokenized_output_min_df_index, - dtype="float", - ).astype(pd.SparseDtype("float", np.nan)) - + def test_vectorization_min_df(self, name, test_function, correct_output): + s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - - pd.testing.assert_series_equal(s_true, result_s) + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -159,19 +172,3 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, 
max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") - - """ - Individual / special tests. - """ - - def test_tfidf_formula(self): - s = pd.Series(["Hi Bye", "Test Bye Bye"]) - s = preprocessing.tokenize(s) - s_true_index = pd.MultiIndex.from_tuples( - [(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")], - ) - s_true = pd.Series( - [_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index - ).astype("Sparse") - - self.assertEqual(representation.tfidf(s), s_true) diff --git a/texthero/representation.py b/texthero/representation.py index 042db71a..efabc9c6 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,11 +97,11 @@ def count( >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) >>> hero.count(s) - count - Sentence one two + count + Sentence one two 0 1 1 0 1 1 0 1 - +# FIXME columns pandas doctest See Also -------- Document Term DataFrame: TODO add tutorial link @@ -375,7 +375,7 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) - +# FIXME: merge master again def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None From 19c52de3f5ae6a1a01e4262dca00ea5177718311 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 19 Aug 2020 22:02:41 +0200 Subject: [PATCH 03/18] implemented correct sparse support *missing: test adopting for new types Co-authored-by: Henri Froese --- tests/test_representation.py | 12 ++++---- texthero/representation.py | 59 +++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index d4acd369..7c02ccd2 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -70,7 +70,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("count"), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -108,7 +108,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("count", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[int64, 0]"), ], [ "term_frequency", @@ -123,7 +123,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): "tfidf", representation.tfidf, pd.DataFrame( - [2.0, 1.0], + [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), ).astype("Sparse"), @@ -146,20 +146,20 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal(s_tokenized_output_index_noncontinous, result_s) + pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = 
test_function(s_tokenized, min_df=2) - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): diff --git a/texthero/representation.py b/texthero/representation.py index efabc9c6..ff691212 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -101,9 +101,12 @@ def count( Sentence one two 0 1 1 0 1 1 0 1 -# FIXME columns pandas doctest + See Also -------- + + # FIXME columns pandas doctest + Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -375,8 +378,11 @@ def pca( values = list(s) return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + + # FIXME: merge master again + def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -437,11 +443,12 @@ def nmf( nmf = NMF(n_components=n_components, init="random", random_state=random_state,) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(values).tolist(), index=s.index) + return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) def tsne( @@ -535,11 +542,12 @@ def tsne( ) if _check_is_valid_DocumentTermDF(s): - values = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - values = list(s) + s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(values).tolist(), index=s.index) + return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) """ @@ -624,9 +632,10 @@ def kmeans( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) kmeans = KMeans( n_clusters=n_clusters, @@ -635,8 +644,8 @@ def kmeans( random_state=random_state, copy_x=True, algorithm=algorithm, - ).fit(vectors) - return pd.Series(kmeans.predict(vectors), index=s.index).astype("category") + ).fit(s_for_vectorization) + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") def dbscan( @@ -727,9 +736,10 @@ def dbscan( """ if _check_is_valid_DocumentTermDF(s): - vectors = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: - vectors = list(s) + s_for_vectorization = list(s) return pd.Series( DBSCAN( @@ -739,7 +749,7 @@ def dbscan( metric_params=metric_params, leaf_size=leaf_size, n_jobs=n_jobs, - ).fit_predict(vectors), + ).fit_predict(s_for_vectorization), index=s.index, ).astype("category") @@ -877,17 +887,15 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> idx = pd.MultiIndex.from_tuples( - ... [(0, "a"), (0, "b"), (1, "c"), (1, "d")], names=("document", "word") - ... 
) - >>> s = pd.Series([1, 2, 3, 4], index=idx) + >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) + >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") >>> hero.normalize(s, norm="max") - document word - 0 a 0.50 - b 1.00 - 1 c 0.75 - d 1.00 - dtype: Sparse[float64, nan] + 0 1 + a b c d + 0 0.250000 0.500000 0.75 1.000000 + 1 0.571429 0.285714 1.00 0.714286 + 2 0.400000 0.400000 0.60 1.000000 + 3 0.111111 0.222222 1.00 0.888889 See Also @@ -900,7 +908,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: isDocumentTermDF = _check_is_valid_DocumentTermDF(s) if isDocumentTermDF: - s_for_vectorization = s.sparse.to_coo() + s_coo = s.sparse.to_coo() + s_for_vectorization = s_coo.astype("float64") else: s_for_vectorization = list(s) From 41f55a8a359f15ce4ba65e1e726b9e0757fc596b Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:20:02 +0200 Subject: [PATCH 04/18] added back list() and rm .tolist() --- texthero/representation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 048b42ec..025652d9 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,7 +37,7 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(s.values.tolist(), index=s.index) + s = pd.Series(list(s.values), index=s.index) return s @@ -415,7 +415,7 @@ def pca( else: values = list(s) - return pd.Series(pca.fit_transform(values).tolist(), index=s.index) + return pd.Series(list(pca.fit_transform(values)), index=s.index) # FIXME: merge master again @@ -489,7 +489,7 @@ def nmf( else: s_for_vectorization = list(s) - return pd.Series(nmf.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(nmf.fit_transform(s_for_vectorization)), index=s.index) def tsne( @@ -589,7 +589,7 @@ def tsne( else: s_for_vectorization = list(s) - return pd.Series(tsne.fit_transform(s_for_vectorization).tolist(), index=s.index) + return pd.Series(list(tsne.fit_transform(s_for_vectorization)), index=s.index) """ @@ -963,4 +963,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(result.tolist(), index=s.index) + return pd.Series(list(result), index=s.index) From 217611a2c648db4044d240a9c12a157b94b36bca Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:21:41 +0200 Subject: [PATCH 05/18] rm .tolist() and added list() --- texthero/representation.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 025652d9..fdab73dd 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -37,36 +37,6 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) - s = pd.Series(list(s.values), index=s.index) - - return s - - -def _check_is_valid_representation(s: pd.Series) -> bool: - """ - Check if the given Pandas Series is a Document Representation Series. - - Returns true if Series is Document Representation Series, else False. 
- - """ - - # TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError" - - if not isinstance(s.index, pd.MultiIndex): - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex" - # ) - - if s.index.nlevels != 2: - return False - # raise ValueError( - # f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." - # ) - - return True - - # Warning message for not-tokenized inputs _not_tokenized_warning_message = ( "It seems like the given Pandas Series s is not tokenized. This function will" @@ -963,4 +933,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series(list(result), index=s.index) + return pd.Series((result), index=s.index) From 6a3b56d1a56401880efa7cfa7dd32668e23b25ea Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:22 +0200 Subject: [PATCH 06/18] Adopted the test to the new dataframes --- tests/test_representation.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 7c02ccd2..3564730e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -90,7 +90,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): _tfidf(x, s_tokenized, 0) # Testing the tfidf formula here for x in ["!", ".", "?", "TEST", "Test"] ], - [_tfidf(x, s_tokenized, 0) for x in ["!", ".", "?", "TEST", "Test"]], + [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), @@ -146,20 +146,28 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_frame_equal(s_tokenized_output_index_noncontinous, result_s.index, check_dtype = False) + pd.testing.assert_series_equal( + pd.Series(s_tokenized_output_index_noncontinous), + pd.Series(result_s.index), + check_dtype=False, + ) @parameterized.expand(test_cases_vectorization_min_df) def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal(s_true, result_s, check_less_precise=True, check_dtype = False) + pd.testing.assert_frame_equal( + s_true, result_s, check_less_precise=True, check_dtype=False + ) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): From b8ff5611e550f5f4bc023b2b76ef8ebcff7f8021 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 21 Aug 2020 10:41:35 
+0200 Subject: [PATCH 07/18] wrong format --- texthero/representation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index fdab73dd..ac0a458f 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -657,7 +657,9 @@ def kmeans( copy_x=True, algorithm=algorithm, ).fit(s_for_vectorization) - return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype("category") + return pd.Series(kmeans.predict(s_for_vectorization), index=s.index).astype( + "category" + ) def dbscan( From e3af2f9da094505861cddc420f57490700ca88ef Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 21 Aug 2020 18:48:51 +0200 Subject: [PATCH 08/18] Address most review comments. --- tests/test_representation.py | 19 ++++++++-------- texthero/representation.py | 42 +++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 3564730e..5f985996 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -50,9 +50,9 @@ def _tfidf(term, corpus, document_index): [["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7] ) -s_tokenized_output_index = [0, 1] +s_tokenized_output_index = pd.Index([0, 1]) -s_tokenized_output_index_noncontinous = [5, 7] +s_tokenized_output_index_noncontinous = pd.Index([5, 7]) def _get_multiindex_for_tokenized_output(first_level_name): @@ -79,7 +79,8 @@ def _get_multiindex_for_tokenized_output(first_level_name): [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("term_frequency"), - ).astype("Sparse"), + dtype="Sparse", + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -94,7 +95,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): ], index=s_tokenized_output_index, columns=_get_multiindex_for_tokenized_output("tfidf"), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -117,7 +118,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [0.666667, 0.333333], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], [ "tfidf", @@ -126,7 +127,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [2, 1], index=s_tokenized_output_index, columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), - ).astype("Sparse"), + ).astype("Sparse[float64, nan]"), ], ] @@ -155,10 +156,8 @@ def test_vectorization_noncontinuous_index_kept( self, name, test_function, correct_output=None ): result_s = test_function(s_tokenized_with_noncontinuous_index) - pd.testing.assert_series_equal( - pd.Series(s_tokenized_output_index_noncontinous), - pd.Series(result_s.index), - check_dtype=False, + pd.testing.assert_index_equal( + s_tokenized_output_index_noncontinous, result_s.index ) @parameterized.expand(test_cases_vectorization_min_df) diff --git a/texthero/representation.py b/texthero/representation.py index ac0a458f..7793cb2b 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -145,7 +145,7 @@ def term_frequency( Return a Document Term DataFrame with the term frequencies of the terms for every - document. + document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. 
If not, it will @@ -241,7 +241,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram formula described above. Return a Document Term DataFrame with the - tfidf of every word in the document. + tfidf of every word in the document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. If not, it will @@ -341,9 +341,13 @@ def pca( In general, *pca* should be called after the text has already been represented to a matrix form. + PCA cannot directly handle sparse input, so when calling pca on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s : Pandas Series or MuliIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -388,9 +392,6 @@ def pca( return pd.Series(list(pca.fit_transform(values)), index=s.index) -# FIXME: merge master again - - def nmf( s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None ) -> pd.Series: @@ -410,10 +411,12 @@ def nmf( n_components many topics (clusters) and calculate a vector for each document that places it correctly among the topics. + NMF can directly handle sparse input, so when calling nmf on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : Int. Default is 2. Number of components to keep (dimensionality of output vectors). @@ -484,10 +487,12 @@ def tsne( document gets a new, low-dimensional (n_components entries) vector in such a way that the differences / similarities between documents are preserved. + T-SNE can directly handle sparse input, so when calling tsne on a + DocumentTermDF, the advantage of sparseness is kept. Parameters ---------- - s : Pandas Series or Pandas MultiIndex Sparse DataFrame + s : Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_components : int, default is 2. Number of components to keep (dimensionality of output vectors). @@ -591,9 +596,12 @@ def kmeans( function that assigns a scalar (a weight) to each word), K-means will find k topics (clusters) and assign a topic to each document. + Kmeans can directly handle sparse input, so when calling kmeans on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) n_clusters: Int, default to 5. The number of clusters to separate the data into. @@ -689,9 +697,12 @@ def dbscan( function that assigns a scalar (a weight) to each word), DBSCAN will find topics (clusters) and assign a topic to each document. + DBSCAN can directly handle sparse input, so when calling dbscan on a + DocumentTermDF, the advantage of sparseness is kept. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) eps : float, default=0.5 The maximum distance between two samples for one to be considered @@ -795,9 +806,13 @@ def meanshift( function that assigns a scalar (a weight) to each word), mean shift will find topics (clusters) and assign a topic to each document. 
+ Menashift cannot directly handle sparse input, so when calling meanshift on a + DocumentTermDF, the input has to be expanded which can lead to + memory problems with big datasets. + Parameters ---------- - s: Pandas Series or Pandas MultiIndex Sparse DataFrame + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) bandwidth : float, default=None Bandwidth used in the RBF kernel. @@ -889,11 +904,12 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. - Input has to be a Representation Series. + Input can be VectorSeries or DocumentTermDF. For DocumentTermDFs, + the sparseness is kept. Parameters ---------- - s: Pandas Series + s: Pandas Series (VectorSeries) or MultiIndex Sparse DataFrame (DocumentTermDF) norm: str, default to "l2" One of "l1", "l2", or "max". The norm that is used. From 77ad80ecf8977a098b73c4f12c8f28951c769dfc Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 21 Aug 2020 19:45:48 +0200 Subject: [PATCH 09/18] Add more unittests for representation --- tests/test_representation.py | 118 +++++++++++++++++++++++++++++++++-- texthero/representation.py | 14 ++--- 2 files changed, 118 insertions(+), 14 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 5f985996..2722289e 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -132,6 +132,50 @@ def _get_multiindex_for_tokenized_output(first_level_name): ] +s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) +s_documenttermDF = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), +).astype("Sparse[float64, nan]") + + +test_cases_dim_reduction_and_clustering = [ + # format: [function_name, function, correct output for s_vector_series and s_documenttermDF input above] + ["pca", representation.pca, pd.Series([[-0.5, 0.0], [0.5, 0.0]], index=[5, 7],),], + [ + "nmf", + representation.nmf, + pd.Series([[5.119042424626627, 0.0], [0.0, 0.0]], index=[5, 7],), + ], + [ + "tsne", + representation.tsne, + pd.Series([[164.86682, 1814.1647], [-164.8667, -1814.1644]], index=[5, 7],), + ], + [ + "kmeans", + representation.kmeans, + pd.Series([1, 0], index=[5, 7], dtype="category"), + ], + [ + "dbscan", + representation.dbscan, + pd.Series([-1, -1], index=[5, 7], dtype="category"), + ], + [ + "meanshift", + representation.meanshift, + pd.Series([0, 1], index=[5, 7], dtype="category"), + ], + [ + "normalize", + representation.normalize, + pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7],), + ], +] + + class AbstractRepresentationTest(PandasTestCase): """ Class for representation test cases. 
Most tests are @@ -147,9 +191,7 @@ class AbstractRepresentationTest(PandasTestCase): def test_vectorization_simple(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_noncontinuous_index_kept( @@ -164,9 +206,7 @@ def test_vectorization_noncontinuous_index_kept( def test_vectorization_min_df(self, name, test_function, correct_output): s_true = correct_output result_s = test_function(s_tokenized, min_df=2) - pd.testing.assert_frame_equal( - s_true, result_s, check_less_precise=True, check_dtype=False - ) + pd.testing.assert_frame_equal(s_true, result_s, check_dtype=False) @parameterized.expand(test_cases_vectorization) def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *args): @@ -179,3 +219,69 @@ def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) except TypeError: self.fail("Sklearn arguments not handled correctly.") + + """ + Dimensionality Reduction and Clustering + """ + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_vector_series_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "kmeans": + result_s = test_function(s_vector_series, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_vector_series) + else: + result_s = test_function(s_vector_series, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + @parameterized.expand(test_cases_dim_reduction_and_clustering) + def test_dim_reduction_and_clustering_with_documenttermDF_input( + self, name, test_function, correct_output + ): + s_true = correct_output + + if name == "normalize": + # testing this below separately + return + + if name == "kmeans": + result_s = test_function(s_documenttermDF, random_state=42, n_clusters=2) + elif name == "dbscan" or name == "meanshift" or name == "normalize": + result_s = test_function(s_documenttermDF) + else: + result_s = test_function(s_documenttermDF, random_state=42) + + pd.testing.assert_series_equal( + s_true, + result_s, + check_dtype=False, + rtol=0.1, + atol=0.1, + check_category_order=False, + ) + + def test_normalize_documenttermDF_also_as_output(self): + # normalize should also return DocumentTermDF output for DocumentTermDF + # input so we test it separately + result = representation.normalize(s_documenttermDF) + correct_output = pd.DataFrame( + [[1.0, 0.0], [0.0, 0.0]], + index=[5, 7], + columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + ) + + pd.testing.assert_frame_equal( + result, correct_output, check_dtype=False, rtol=0.1, atol=0.1, + ) diff --git a/texthero/representation.py b/texthero/representation.py index 7793cb2b..8e876088 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -97,7 +97,7 @@ def count( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) - >>> hero.count(s) + >>> hero.count(s) # doctest: +SKIP count Sentence one two 0 1 1 0 @@ -106,8 +106,6 @@ def count( See Also 
-------- - # FIXME columns pandas doctest - Document Term DataFrame: TODO add tutorial link """ # TODO. Can be rewritten without sklearn. @@ -177,7 +175,7 @@ def term_frequency( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) + >>> hero.term_frequency(s) # doctest: +SKIP term_frequency Sentence hey one two 0 0.2 0.2 0.2 0.0 @@ -273,7 +271,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) + >>> hero.tfidf(s) # doctest: +SKIP tfidf Bye Hi Test 0 1.0 1.405465 0.000000 @@ -900,7 +898,7 @@ def meanshift( """ -def normalize(s: pd.Series, norm="l2") -> pd.Series: +def normalize(s: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Series: """ Normalize every cell in a Pandas Series. @@ -920,7 +918,7 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: >>> import pandas as pd >>> col = pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "c"), (1, "d")]) >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], columns=col).astype("Sparse") - >>> hero.normalize(s, norm="max") + >>> hero.normalize(s, norm="max") # doctest: +SKIP 0 1 a b c d 0 0.250000 0.500000 0.75 1.000000 @@ -951,4 +949,4 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: if isDocumentTermDF: return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns) else: - return pd.Series((result), index=s.index) + return pd.Series(list(result), index=s.index) From c6ca37f0614551f388ef900cbe22394c461ddc0a Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Sat, 22 Aug 2020 16:03:13 +0200 Subject: [PATCH 10/18] implemented setItem *missing: unitTest Co-authored-by: Henri Froese --- texthero/__init__.py | 2 + texthero/_helper.py | 156 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/texthero/__init__.py b/texthero/__init__.py index 66e891e9..bc4341be 100644 --- a/texthero/__init__.py +++ b/texthero/__init__.py @@ -16,3 +16,5 @@ from .nlp import * from . import stopwords + +from . import _helper \ No newline at end of file diff --git a/texthero/_helper.py b/texthero/_helper.py index 6319c056..df3e8270 100644 --- a/texthero/_helper.py +++ b/texthero/_helper.py @@ -71,3 +71,159 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +''' +Pandas Integration of DocumentTermDF + +It's really important that users can seamlessly integrate texthero's function +output with their code. Let's assume a user has his documents in a DataFrame +`df["texts"]` that looks like this: + +``` +>>> df = pd.DataFrame(["Text of doc 1", "Text of doc 2", "Text of doc 3"], columns=["text"]) +>>> df + text +0 Text of doc 1 +1 Text of doc 2 +2 Text of doc 3 + +``` + + Let's look at an example output that `hero.count` could +return with the DocumentTermDF: + +``` +>>> hero.count(df["text"]) + count + 1 2 3 Text doc of +0 1 0 0 1 1 1 +1 0 1 0 1 1 1 +2 0 0 1 1 1 1 +``` + +That's a DataFrame. Great! Of course, users can +just store this somewhere as e.g. `df_count = hero.count(df["texts"])`, +and that works great. Accessing is then also as always: to get the +count values, they can just do `df_count.values` and have the count matrix +right there! + +However, what we see really often is users wanting to do this: +`df["count"] = hero.count(df["texts"])`. This sadly does not work out +of the box. 
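For example, trying the assignment directly in plain pandas fails
(illustrative; the exact error message depends on the pandas version):

```
>>> df["count"] = hero.count(df["texts"])  # plain pandas raises ValueError
```
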
+The reason is that this subcolumn type is implemented
+internally through a _MultiIndex in the columns_. So we have
+
+```
+>>> df.columns
+Index(['text'], dtype='object')
+>>> hero.count(df["text"]).columns
+MultiIndex([('count', '1'),
+            ('count', '2'),
+            ('count', '3'),
+            ('count', 'Text'),
+            ('count', 'doc'),
+            ('count', 'of')],
+           )
+
+```
+
+Pandas _cannot_ automatically combine these. So what we will
+do is this: Calling `df["count"] = hero.count(df["text"])` is
+internally this: `pd.DataFrame.__setitem__(self=df, key="count", value=hero.count(df["text"]))`.
+We will overwrite this method so that if _self_ is not multiindexed yet
+and _value_ is multiindexed, we transform _self_ (so `df` here) to
+be multiindexed and we can then easily integrate our column-multiindexed output from texthero:
+
+If `df` is multiindexed, we get the desired result through `pd.concat([df, hero.count(df["text"])], axis=1)`.
+
+Pseudocode (& real code): working on this atm :3rd_place_medal:
+
+Advantages / Why does this work?
+
+ - we don't destroy any pandas functionality as currently calling
+   `__setitem__` with a Multiindexed value is just not possible, so
+   our changes to Pandas do not break any Pandas functionality for
+   the users. We're only _expanding_ the functionality
+
+ - after multiindexing, users can still access their
+   "normal" columns like before; e.g. `df["text"]` will
+   behave the same way as before even though it is now internally
+   multiindexed as `MultiIndex([('text', ''),
+            ('count', '1'),
+            ('count', '2'),
+            ('count', '3'),
+            ('count', 'Text'),
+            ('count', 'doc'),
+            ('count', 'of')],
+           )`.
+
+Disadvantage:
+
+    - poor performance, so we discourage users from using it, but we still want to support it
+'''
+
+# Store the original __setitem__ function as _original__setitem__
+_pd_original__setitem__ = pd.DataFrame.__setitem__
+pd.DataFrame._original__setitem__ = _pd_original__setitem__
+
+
+# Define a new __setitem__ function that will replace pd.DataFrame.__setitem__
+def _hero__setitem__(self, key, value):
+    '''
+    Called when doing self["key"] = value.
+    E.g. df["count"] = hero.count(df["text"]) is internally doing
+    pd.DataFrame.__setitem__(self=df, key="count", value=hero.count(df["text"])).
+
+    So self is df, key is the new column's name, value is
+    what we want to put into the new column.
+
+    What we do:
+
+    1. If user calls __setitem__ with value being multiindexed, e.g.
+    df["count"] = hero.count(df["text"]),
+    so __setitem__(self=df, key="count", value=hero.count(df["text"]))
+
+    2. we make self multiindexed if it isn't already
+    -> e.g. column "text" internally becomes multiindexed
+    to ("text", "") but users do _not_ notice this.
+    This is a very quick operation that does not need
+    to look at the df's values, we just reassign
+    self.columns
+
+    3. we change value's columns so the first level is named `key`
+    -> e.g. a user might do df["haha"] = hero.count(df["text"]),
+    so just doing df[hero.count(df["text"]).columns] = hero.count(df["text"])
+    would give them a new column that is named like texthero's output,
+    e.g. "count" instead of "haha". So we internally rename the
+    value columns (e.g. ('haha', '1'),
+            ('haha', '2'),
+            ('haha', '3'),
+            ('haha', 'Text'),
+            ('haha', 'doc'),
+            ('haha', 'of'))
+
+    4. we do self[value.columns] = value as that's exactly the command
+    that correctly integrates the multiindexed `value` into `self`
+
+    '''
+
+
+    # 1.
+    if isinstance(value, pd.DataFrame) and isinstance(value.columns, pd.MultiIndex) and isinstance(key, str):
+
+        # 2.
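+        # (A sketch of what happens here: Index(["text"]) becomes
+        # MultiIndex([("text", "")]). Only self.columns is reassigned;
+        # the underlying data is not copied or touched.)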
+        if not isinstance(self.columns, pd.MultiIndex):
+            self.columns = pd.MultiIndex.from_tuples([(col_name, "") for col_name in self.columns.values])
+
+        # 3.
+        value.columns = pd.MultiIndex.from_tuples([(key, subcol_name) for _, subcol_name in value.columns.values])
+
+        # 4.
+        self[value.columns] = value
+
+    else:
+
+        self._original__setitem__(key, value)
+
+
+# Replace __setitem__ with our custom function
+pd.DataFrame.__setitem__ = _hero__setitem__

From 8731ea7fcf1cf050d905f5638663512dcabc18d6 Mon Sep 17 00:00:00 2001
From: Maximilian Krahn
Date: Sat, 22 Aug 2020 17:14:00 +0200
Subject: [PATCH 11/18] formated files

---
 texthero/__init__.py |  2 +-
 texthero/_helper.py  | 33 ++++++++++++++++++++-------------
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/texthero/__init__.py b/texthero/__init__.py
index bc4341be..8998dd32 100644
--- a/texthero/__init__.py
+++ b/texthero/__init__.py
@@ -17,4 +17,4 @@
 
 from . import stopwords
 
-from . import _helper
\ No newline at end of file
+from . import _helper
diff --git a/texthero/_helper.py b/texthero/_helper.py
index df3e8270..91466028 100644
--- a/texthero/_helper.py
+++ b/texthero/_helper.py
@@ -73,7 +73,7 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
-'''
+"""
 Pandas Integration of DocumentTermDF
 
 It's really important that users can seamlessly integrate texthero's function
@@ -132,11 +132,11 @@ def wrapper(*args, **kwargs):
 internally this: `pd.DataFrame.__setitem__(self=df, key="count", value=hero.count(df["text"]))`.
 We will overwrite this method so that if _self_ is not multiindexed yet
 and _value_ is multiindexed, we transform _self_ (so `df` here) to
-be multiindexed and we can then easily integrate our column-multiindexed output from texthero:
-
-If `df` is multiindexed, we get the desired result through `pd.concat([df, hero.count(df["text"])], axis=1)`.
+be multiindexed and we can then easily integrate our column-multiindexed output from texthero.
+See the implementation below for details.
 
-Pseudocode (& real code): working on this atm :3rd_place_medal:
+Additionally, we support this for pd.concat in a similar way; again, see the
+implementation below for details.
 
 Advantages / Why does this work?
 
@@ -159,7 +159,7 @@ def wrapper(*args, **kwargs):
 Disadvantage:
 
     - poor performance, so we discourage users from using it, but we still want to support it
-'''
+"""
 
 # Store the original __setitem__ function as _original__setitem__
 _pd_original__setitem__ = pd.DataFrame.__setitem__
@@ -168,7 +168,7 @@ def wrapper(*args, **kwargs):
 
 # Define a new __setitem__ function that will replace pd.DataFrame.__setitem__
 def _hero__setitem__(self, key, value):
-    '''
+    """
     Called when doing self["key"] = value.
    E.g. df["count"] = hero.count(df["text"]) is internally doing
     pd.DataFrame.__setitem__(self=df, key="count", value=hero.count(df["text"])).
@@ -204,25 +204,32 @@ def _hero__setitem__(self, key, value):
 
     4. we do self[value.columns] = value as that's exactly the command
     that correctly integrates the multiindexed `value` into `self`
 
-    '''
-
+    """
 
     # 1.
-    if isinstance(value, pd.DataFrame) and isinstance(value.columns, pd.MultiIndex) and isinstance(key, str):
+    if (
+        isinstance(value, pd.DataFrame)
+        and isinstance(value.columns, pd.MultiIndex)
+        and isinstance(key, str)
+    ):
 
         # 2.
-        if not isinstance(self.columns, pd.MultiIndex):
-            self.columns = pd.MultiIndex.from_tuples([(col_name, "") for col_name in self.columns.values])
+        if not isinstance(self.columns, pd.MultiIndex):
+            self.columns = pd.MultiIndex.from_tuples(
+                [(col_name, "") for col_name in self.columns.values]
+            )
 
         # 3.
- value.columns = pd.MultiIndex.from_tuples([(key, subcol_name) for _, subcol_name in value.columns.values]) + value.columns = pd.MultiIndex.from_tuples( + [(key, subcol_name) for _, subcol_name in value.columns.values] + ) # 4. self[value.columns] = value else: - self._original__setitem__(key, value) + self._original__setitem__(key, value) # Replace __setitem__ with our custom function From 5c4db2fdf92d051d661c576a6490582a9b42a8ec Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Sat, 22 Aug 2020 17:42:25 +0200 Subject: [PATCH 12/18] Add tests for custom pandas setitem method --- .gitignore | 1 + tests/test_helpers.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/.gitignore b/.gitignore index 1b8d1ce7..fd29d505 100644 --- a/.gitignore +++ b/.gitignore @@ -184,3 +184,4 @@ dmypy.json # Cython debug symbols cython_debug/ docs/source/api +.vscode/launch.json diff --git a/tests/test_helpers.py b/tests/test_helpers.py index a0e4a195..d6197263 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -74,3 +74,32 @@ def f(s): with warnings.catch_warnings(): warnings.simplefilter("ignore") self.assertTrue(f(s).index.equals(s_true.index)) + + def test_pandas_set_item_normal(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]) + df2 = pd.DataFrame([0, 1]) + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + ) + + def test_pandas_set_item_multiIndex(self): + df1 = pd.DataFrame(["Text 1", "Text 2"], columns=["Test"]) + df2 = pd.DataFrame( + [[3, 5], [8, 4]], columns=pd.MultiIndex.from_product([["count"], [0, 1]]), + ) + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame( + [["Text 1", 3, 5], ["Text 2", 8, 4]], + columns=pd.MultiIndex.from_tuples( + [("Test", ""), ("here", 0), ("here", 1)] + ), + ), + ) From e2768b543b76b5d4d496f82061c70e5481317d61 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:04:45 +0200 Subject: [PATCH 13/18] implemented the suggested changes --- setup.cfg | 2 +- tests/test_representation.py | 23 ++++++++--------------- texthero/representation.py | 29 +++++++---------------------- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/setup.cfg b/setup.cfg index d6103b02..cc082845 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. 
[options.extras_require] dev = - black>=19.10b0 + black=19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 diff --git a/tests/test_representation.py b/tests/test_representation.py index 2722289e..e1db52e1 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -54,13 +54,6 @@ def _tfidf(term, corpus, document_index): s_tokenized_output_index_noncontinous = pd.Index([5, 7]) - -def _get_multiindex_for_tokenized_output(first_level_name): - return pd.MultiIndex.from_product( - [[first_level_name], ["!", ".", "?", "TEST", "Test"]] - ) - - test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above] [ @@ -69,7 +62,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [[1, 0, 0, 1, 2], [0, 2, 1, 0, 1]], index=s_tokenized_output_index, - columns=_get_multiindex_for_tokenized_output("count"), + columns=["!", ".", "?", "TEST", "Test"], ).astype("Sparse[int64, 0]"), ], [ @@ -78,7 +71,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [[0.125, 0.0, 0.0, 0.125, 0.250], [0.0, 0.25, 0.125, 0.0, 0.125]], index=s_tokenized_output_index, - columns=_get_multiindex_for_tokenized_output("term_frequency"), + columns=["!", ".", "?", "TEST", "Test"], dtype="Sparse", ).astype("Sparse[float64, nan]"), ], @@ -94,7 +87,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): [_tfidf(x, s_tokenized, 1) for x in ["!", ".", "?", "TEST", "Test"]], ], index=s_tokenized_output_index, - columns=_get_multiindex_for_tokenized_output("tfidf"), + columns=["!", ".", "?", "TEST", "Test"], ).astype("Sparse[float64, nan]"), ], ] @@ -108,7 +101,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [2, 1], index=s_tokenized_output_index, - columns=pd.MultiIndex.from_tuples([("count", "Test")]), + columns=["Test"], ).astype("Sparse[int64, 0]"), ], [ @@ -117,7 +110,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [0.666667, 0.333333], index=s_tokenized_output_index, - columns=pd.MultiIndex.from_tuples([("term_frequency", "Test")]), + columns=[ "Test"], ).astype("Sparse[float64, nan]"), ], [ @@ -126,7 +119,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): pd.DataFrame( [2, 1], index=s_tokenized_output_index, - columns=pd.MultiIndex.from_tuples([("tfidf", "Test")]), + columns= ["Test"], ).astype("Sparse[float64, nan]"), ], ] @@ -136,7 +129,7 @@ def _get_multiindex_for_tokenized_output(first_level_name): s_documenttermDF = pd.DataFrame( [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], - columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + columns=["a", "b"], ).astype("Sparse[float64, nan]") @@ -279,7 +272,7 @@ def test_normalize_documenttermDF_also_as_output(self): correct_output = pd.DataFrame( [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], - columns=pd.MultiIndex.from_product([["test"], ["a", "b"]]), + columns= ["a", "b"], ) pd.testing.assert_frame_equal( diff --git a/texthero/representation.py b/texthero/representation.py index 8e876088..ce8aae9c 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -34,7 +34,7 @@ def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool: Returns true if input is Document Term DF, else False. 
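+
+    A quick sketch of the intended check (illustrative values):
+
+    >>> import pandas as pd
+    >>> _check_is_valid_DocumentTermDF(pd.DataFrame([[1, 0]], columns=["a", "b"]))  # doctest: +SKIP
+    True
+    >>> _check_is_valid_DocumentTermDF(pd.Series([1, 0]))  # doctest: +SKIP
+    False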
""" - return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex) + return isinstance(df, pd.DataFrame) and not isinstance(df.columns, pd.MultiIndex) # Warning message for not-tokenized inputs @@ -97,8 +97,7 @@ def count( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) - >>> hero.count(s) # doctest: +SKIP - count + >>> hero.count(s) # doctest: +SKIP Sentence one two 0 1 1 0 1 1 0 1 @@ -126,12 +125,8 @@ def count( tf_vectors_csr = tf.fit_transform(s) - multiindexed_columns = pd.MultiIndex.from_tuples( - [("count", word) for word in tf.get_feature_names()] - ) - return pd.DataFrame.sparse.from_spmatrix( - tf_vectors_csr, s.index, multiindexed_columns + tf_vectors_csr, s.index, tf.get_feature_names() ) @@ -175,8 +170,7 @@ def term_frequency( >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize) - >>> hero.term_frequency(s) # doctest: +SKIP - term_frequency + >>> hero.term_frequency(s) # doctest: +SKIP Sentence hey one two 0 0.2 0.2 0.2 0.0 1 0.2 0.0 0.0 0.2 @@ -204,12 +198,8 @@ def term_frequency( total_count_coo = np.sum(tf_vectors_coo) frequency_coo = np.divide(tf_vectors_coo, total_count_coo) - multiindexed_columns = pd.MultiIndex.from_tuples( - [("term_frequency", word) for word in tf.get_feature_names()] - ) - return pd.DataFrame.sparse.from_spmatrix( - frequency_coo, s.index, multiindexed_columns + frequency_coo, s.index, tf.get_feature_names() ) @@ -271,8 +261,7 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) # doctest: +SKIP - tfidf + >>> hero.tfidf(s) # doctest: +SKIP Bye Hi Test 0 1.0 1.405465 0.000000 1 2.0 0.000000 1.405465 @@ -301,12 +290,8 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram tfidf_vectors_csr = tfidf.fit_transform(s) - multiindexed_columns = pd.MultiIndex.from_tuples( - [("tfidf", word) for word in tfidf.get_feature_names()] - ) - return pd.DataFrame.sparse.from_spmatrix( - tfidf_vectors_csr, s.index, multiindexed_columns + tfidf_vectors_csr, s.index, tfidf.get_feature_names() ) From b09f6242c8b97865aaa5a23a9c52e2b1b1ffa4c2 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:08:26 +0200 Subject: [PATCH 14/18] fixed messy docstring --- texthero/representation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index ce8aae9c..f7e2a01a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -134,11 +134,11 @@ def term_frequency( s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0, ) -> pd.DataFrame: """ - Represent a text-based Pandas Series using term frequency. + Return a count document-term DataFrame based on the given Pandas Series - Return a Document Term DataFrame with the - term frequencies of the terms for every - document. The output is sparse. + Rows of the returned DataFrame represent document whereas columns are terms. + The value in the cell document-term is the frequency of the term in + this document. The output is sparse. TODO add tutorial link The input Series should already be tokenized. 
If not, it will From 508c3617988302a898f6c623eba6b02e910b2aee Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:18:49 +0200 Subject: [PATCH 15/18] fix black issues --- .travis.yml | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f913f183..c76284b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black + - pip3 install black==19.10b0 - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index cc082845..3f86e7f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. [options.extras_require] dev = - black=19.10b0 + black==19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 From 75e955fbce511e353d3381a1cb55d1e826bf8fef Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Fri, 4 Sep 2020 17:19:25 +0200 Subject: [PATCH 16/18] fix formatting --- tests/test_representation.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index e1db52e1..ff821efb 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -98,38 +98,30 @@ def _tfidf(term, corpus, document_index): [ "count", representation.count, - pd.DataFrame( - [2, 1], - index=s_tokenized_output_index, - columns=["Test"], - ).astype("Sparse[int64, 0]"), + pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype( + "Sparse[int64, 0]" + ), ], [ "term_frequency", representation.term_frequency, pd.DataFrame( - [0.666667, 0.333333], - index=s_tokenized_output_index, - columns=[ "Test"], + [0.666667, 0.333333], index=s_tokenized_output_index, columns=["Test"], ).astype("Sparse[float64, nan]"), ], [ "tfidf", representation.tfidf, - pd.DataFrame( - [2, 1], - index=s_tokenized_output_index, - columns= ["Test"], - ).astype("Sparse[float64, nan]"), + pd.DataFrame([2, 1], index=s_tokenized_output_index, columns=["Test"],).astype( + "Sparse[float64, nan]" + ), ], ] s_vector_series = pd.Series([[1.0, 0.0], [0.0, 0.0]], index=[5, 7]) s_documenttermDF = pd.DataFrame( - [[1.0, 0.0], [0.0, 0.0]], - index=[5, 7], - columns=["a", "b"], + [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"], ).astype("Sparse[float64, nan]") @@ -270,9 +262,7 @@ def test_normalize_documenttermDF_also_as_output(self): # input so we test it separately result = representation.normalize(s_documenttermDF) correct_output = pd.DataFrame( - [[1.0, 0.0], [0.0, 0.0]], - index=[5, 7], - columns= ["a", "b"], + [[1.0, 0.0], [0.0, 0.0]], index=[5, 7], columns=["a", "b"], ) pd.testing.assert_frame_equal( From 7bf35837513fb69e98d7dbcfb0c6dd7062325945 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Sat, 5 Sep 2020 17:08:45 +0200 Subject: [PATCH 17/18] apdated set_item to the new requierements --- tests/test_helpers.py | 8 +++----- texthero/_helper.py | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index d6197263..71792fc6 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -88,18 +88,16 @@ def test_pandas_set_item_normal(self): def test_pandas_set_item_multiIndex(self): 
df1 = pd.DataFrame(["Text 1", "Text 2"], columns=["Test"]) - df2 = pd.DataFrame( - [[3, 5], [8, 4]], columns=pd.MultiIndex.from_product([["count"], [0, 1]]), - ) + df2 = pd.DataFrame([[3, 5], [8, 4]], columns=["term 1", "term 2"],) - df1["here"] = df2 + df1["count"] = df2 pd.testing.assert_frame_equal( df1, pd.DataFrame( [["Text 1", 3, 5], ["Text 2", 8, 4]], columns=pd.MultiIndex.from_tuples( - [("Test", ""), ("here", 0), ("here", 1)] + [("Test", ""), ("count", "term 1"), ("count", "term 2")] ), ), ) diff --git a/texthero/_helper.py b/texthero/_helper.py index 91466028..f9d03a54 100644 --- a/texthero/_helper.py +++ b/texthero/_helper.py @@ -209,7 +209,7 @@ def _hero__setitem__(self, key, value): # 1. if ( isinstance(value, pd.DataFrame) - and isinstance(value.columns, pd.MultiIndex) + and len(value.columns) > 1 and isinstance(key, str) ): @@ -221,7 +221,7 @@ def _hero__setitem__(self, key, value): # 3. value.columns = pd.MultiIndex.from_tuples( - [(key, subcol_name) for _, subcol_name in value.columns.values] + [(key, subcol_name) for subcol_name in value.columns.values] ) # 4. From f44739e50c333f8ad5463bb61bbac8fa92f91461 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sat, 12 Sep 2020 15:05:21 +0200 Subject: [PATCH 18/18] Add tests for sparseness. Co-authored-by: Maximilian Krahn --- tests/test_helpers.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 71792fc6..eb97bde2 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -101,3 +101,39 @@ def test_pandas_set_item_multiIndex(self): ), ), ) + + def test_pandas_set_item_sparse_df1(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]).astype("Sparse") + df2 = pd.DataFrame([0, 1]) + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + check_dtype=False, + ) + + def test_pandas_set_item_sparse_df2(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]) + df2 = pd.DataFrame([0, 1]).astype("Sparse") + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + check_dtype=False, + ) + + def test_pandas_set_item_sparse_df1_and_df2(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]).astype("Sparse") + df2 = pd.DataFrame([0, 1]).astype("Sparse") + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + check_dtype=False, + )
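
Taken together, the series enables the following end-to-end flow. Below is an
illustrative sketch (example data invented here; the exact term columns depend
on the tokenizer):

```
import pandas as pd
import texthero as hero  # importing texthero also installs the patched __setitem__

df = pd.DataFrame(["Text of doc 1", "Text of doc 2"], columns=["text"])

# count returns a sparse DocumentTermDF: one flat column per term.
counts = hero.count(hero.tokenize(df["text"]))

# The patched pd.DataFrame.__setitem__ accepts the multi-column value
# under the single key "count" and multiindexes df behind the scenes:
df["count"] = counts

print(df.columns)
# e.g. MultiIndex([('text', ''), ('count', '1'), ('count', '2'),
#                  ('count', 'Text'), ('count', 'doc'), ('count', 'of')])
```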