diff --git a/.gitignore b/.gitignore index 1b8d1ce7..fd29d505 100644 --- a/.gitignore +++ b/.gitignore @@ -184,3 +184,4 @@ dmypy.json # Cython debug symbols cython_debug/ docs/source/api +.vscode/launch.json diff --git a/tests/test_helpers.py b/tests/test_helpers.py index a0e4a195..eb97bde2 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -74,3 +74,66 @@ def f(s): with warnings.catch_warnings(): warnings.simplefilter("ignore") self.assertTrue(f(s).index.equals(s_true.index)) + + def test_pandas_set_item_normal(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]) + df2 = pd.DataFrame([0, 1]) + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + ) + + def test_pandas_set_item_multiIndex(self): + df1 = pd.DataFrame(["Text 1", "Text 2"], columns=["Test"]) + df2 = pd.DataFrame([[3, 5], [8, 4]], columns=["term 1", "term 2"],) + + df1["count"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame( + [["Text 1", 3, 5], ["Text 2", 8, 4]], + columns=pd.MultiIndex.from_tuples( + [("Test", ""), ("count", "term 1"), ("count", "term 2")] + ), + ), + ) + + def test_pandas_set_item_sparse_df1(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]).astype("Sparse") + df2 = pd.DataFrame([0, 1]) + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + check_dtype=False, + ) + + def test_pandas_set_item_sparse_df2(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]) + df2 = pd.DataFrame([0, 1]).astype("Sparse") + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + check_dtype=False, + ) + + def test_pandas_set_item_sparse_df1_and_df2(self): + df1 = pd.DataFrame([[1, 2], [5, 3]], columns=["Test", "Test2"]).astype("Sparse") + df2 = pd.DataFrame([0, 1]).astype("Sparse") + + df1["here"] = df2 + + pd.testing.assert_frame_equal( + df1, + pd.DataFrame([[1, 2, 0], [5, 3, 1]], columns=["Test", "Test2", "here"]), + check_dtype=False, + ) diff --git a/texthero/__init__.py b/texthero/__init__.py index 66e891e9..8998dd32 100644 --- a/texthero/__init__.py +++ b/texthero/__init__.py @@ -16,3 +16,5 @@ from .nlp import * from . import stopwords + +from . import _helper diff --git a/texthero/_helper.py b/texthero/_helper.py index 6319c056..f9d03a54 100644 --- a/texthero/_helper.py +++ b/texthero/_helper.py @@ -71,3 +71,166 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +""" +Pandas Integration of DocumentTermDF + +It's really important that users can seamlessly integrate texthero's function +output with their code. Let's assume a user has his documents in a DataFrame +`df["texts"]` that looks like this: + +``` +>>> df = pd.DataFrame(["Text of doc 1", "Text of doc 2", "Text of doc 3"], columns=["text"]) +>>> df + text +0 Text of doc 1 +1 Text of doc 2 +2 Text of doc 3 + +``` + + Let's look at an example output that `hero.count` could +return with the DocumentTermDF: + +``` +>>> hero.count(df["text"]) + count + 1 2 3 Text doc of +0 1 0 0 1 1 1 +1 0 1 0 1 1 1 +2 0 0 1 1 1 1 +``` + +That's a DataFrame. Great! Of course, users can +just store this somewhere as e.g. `df_count = hero.count(df["texts"])`, +and that works great. Accessing is then also as always: to get the +count values, they can just do `df_count.values` and have the count matrix +right there! + +However, what we see really often is users wanting to do this: +`df["count"] = hero.count(df["texts"])`. This sadly does not work out +of the box. The reason is that this subcolumn type is implemented +internally through a _Multiindex in the columns_. So we have + +``` +>>> df.columns +Index(['text'], dtype='object') +>>> hero.count(df["texts"]).columns +MultiIndex([('count', '1'), + ('count', '2'), + ('count', '3'), + ('count', 'Text'), + ('count', 'doc'), + ('count', 'of')], + ) + +``` + +Pandas _cannot_ automatically combine these. So what we will +do is this: Calling `df["count"] = hero.count(df["texts"])` is +internally this: `pd.DataFrame.__setitem__(self=df, key="count", value=hero.count(df["texts"]))`. +We will overwrite this method so that if _self_ is not multiindexed yet +and _value_ is multiindexed, we transform _self_ (so `df` here) to +be multiindexed and we can then easily integrate our column-multiindexed output from texthero. +See the implementation below for details. + +Additionally, we support this for pd.concat in a similar way; again, see the +implementation below for details. + +Advantages / Why does this work? + + - we don't destroy any pandas functionality as currently calling + `__setitem__` with a Multiindexed value is just not possible, so + our changes to Pandas do not break any Pandas functionality for + the users. We're only _expanding_ the functinoality + + - after multiindexing, users can still access their + "normal" columns like before; e.g. `df["texts"]` will + behave the same way as before even though it is now internally + multiindexed as `MultiIndex([('text', ''), ('count', '1'), + ('count', '2'), + ('count', '3'), + ('count', 'Text'), + ('count', 'doc'), + ('count', 'of')], + )`. + +Disadvantage: + + - poor performance, so we discurage user from using it, but we still want to support it +""" + +# Store the original __setitem__ function as _original__setitem__ +_pd_original__setitem__ = pd.DataFrame.__setitem__ +pd.DataFrame._original__setitem__ = _pd_original__setitem__ + + +# Define a new __setitem__ function that will replace pd.DataFrame.__setitem__ +def _hero__setitem__(self, key, value): + """ + Called when doing self["key"] = value. + E.g. df["count"] = hero.count(df["texts"]) is internally doing + pd.DataFrame.__setitem__(self=df, key="count", value=hero.count(df["texts"]). + + So self is df, key is the new column's name, value is + what we want to put into the new column. + + What we do: + + 1. If user calls __setitem__ with value being multiindexed, e.g. + df["count"] = hero.count(df["texts"]), + so __setitem__(self=df, key="count", value=hero.count(df["texts"]) + + 2. we make self multiindexed if it isn't already + -> e.g. column "text" internally becomes multiindexed + to ("text", "") but users do _not_ notice this. + This is a very quick operation that does not need + to look at the df's values, we just reassign + self.columns + + 3. we change value's columns so the first level is named `key` + -> e.g. a user might do df["haha"] = hero.count(df["texts"]), + so just doing df[hero.count(df["texts"]).columns] = hero.count(df["texts"]) + would give him a new column that is named like texthero's output, + e.g. "count" instead of "haha". So we internally rename the + value columns (e.g. ('haha', '1'), + ('haha', '2'), + ('haha', '3'), + ('haha', 'Text'), + ('haha', 'doc'), + ('haha', 'of')]]) + + 4. we do self[value.columns] = value as that's exactly the command + that correctly integrates the multiindexed `value` into `self` + + """ + + # 1. + if ( + isinstance(value, pd.DataFrame) + and len(value.columns) > 1 + and isinstance(key, str) + ): + + # 2. + if not isinstance(self.columns, pd.MultiIndex): + self.columns = pd.MultiIndex.from_tuples( + [(col_name, "") for col_name in self.columns.values] + ) + + # 3. + value.columns = pd.MultiIndex.from_tuples( + [(key, subcol_name) for subcol_name in value.columns.values] + ) + + # 4. + self[value.columns] = value + + else: + + self._original__setitem__(key, value) + + +# Replace __setitem__ with our custom function +pd.DataFrame.__setitem__ = _hero__setitem__