def lookup(self, row_labels, col_labels) -> np.ndarray:
    """
    Label-based "fancy indexing" function for DataFrame.

    Given equal-length arrays of row and column labels, return an
    array of the values corresponding to each (row, col) pair.

    `row_labels` and `col_labels` do not support the types `Series` and `Index`,
    to prevent performance degradation.

    Parameters
    ----------
    row_labels : sequence
        The row labels to use for lookup.
    col_labels : sequence
        The column labels to use for lookup.

    Returns
    -------
    numpy.ndarray
        The found values.

    Raises
    ------
    TypeError
        If `row_labels` or `col_labels` is a Koalas `Series` or `Index`.
    ValueError
        If `row_labels` and `col_labels` do not have the same length.

    Examples
    --------
    >>> kdf = ks.DataFrame({'A': [3, 4, 5, 6, 7],
    ...                     'B': [10.0, 20.0, 30.0, 40.0, 50.0],
    ...                     'C': ['a', 'b', 'c', 'd', 'e']})
    >>> kdf
       A     B  C
    0  3  10.0  a
    1  4  20.0  b
    2  5  30.0  c
    3  6  40.0  d
    4  7  50.0  e

    >>> kdf.lookup([0], ["C"])
    array(['a'], dtype=object)

    >>> kdf.lookup([2, 3], ["A", "B"])
    array([ 5., 40.])
    """
    from databricks.koalas.series import Series
    from databricks.koalas.indexes import Index, MultiIndex

    # Reject Koalas Series/Index first: even taking their length would
    # require going through the distributed backend.
    if isinstance(row_labels, (Series, Index)):
        raise TypeError(
            "'row_labels' doesn't support type '{}'.".format(type(row_labels).__name__)
        )
    if isinstance(col_labels, (Series, Index)):
        raise TypeError(
            "'col_labels' doesn't support type '{}'.".format(type(col_labels).__name__)
        )

    # Validate the pairing up front so that both index kinds fail fast with the
    # same error, before any data is selected or collected.
    if len(row_labels) != len(col_labels):
        raise ValueError("Row labels must have same size as column labels")

    if not isinstance(self.index, MultiIndex):
        # Narrow the frame to the distinct requested rows/columns, collect that
        # subset to pandas, and let pandas resolve the (row, col) pairs.
        return (
            self.loc[list(set(row_labels)), list(set(col_labels))]
            .to_pandas()
            .lookup(row_labels, col_labels)
        )

    # MultiIndex: resolve each (row, col) pair individually through `loc`.
    lookups = [
        self.loc[row_label, col_label] for row_label, col_label in zip(row_labels, col_labels)
    ]
    # Going through pd.Series lets pandas infer a common dtype for the result.
    return np.asarray(pd.Series(lookups))
def test_lookup(self):
    # Compare Koalas' DataFrame.lookup against pandas for each supported label
    # container, then check the error paths.
    pdf = pd.DataFrame(
        {
            "A": [3, 4, 5, 6, 7],
            "B": [10.0, 20.0, 30.0, 40.0, 50.0],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    kdf = ks.from_pandas(pdf)

    # list
    self.assert_eq(pdf.lookup([0], ["C"]), kdf.lookup([0], ["C"]))
    self.assert_list_eq(
        pdf.lookup([0, 3, 4], ["A", "C", "A"]), kdf.lookup([0, 3, 4], ["A", "C", "A"])
    )

    # tuple
    self.assert_eq(pdf.lookup((0,), ("C",)), kdf.lookup((0,), ("C",)))
    self.assert_list_eq(
        pdf.lookup((0, 3, 4), ("A", "C", "A")), kdf.lookup((0, 3, 4), ("A", "C", "A"))
    )

    # dict (only the keys are used as labels)
    self.assert_eq(pdf.lookup({0: None}, {"C": None}), kdf.lookup({0: None}, {"C": None}))
    self.assert_list_eq(
        pdf.lookup({0: None, 3: None, 4: None}, {"A": None, "C": None, "B": None}),
        kdf.lookup({0: None, 3: None, 4: None}, {"A": None, "C": None, "B": None}),
    )

    # Mismatched label lengths on a single-level index
    err_msg = "Row labels must have same size as column labels"
    with self.assertRaisesRegex(ValueError, err_msg):
        kdf.lookup([0, 3, 4], ["A", "C"])

    # MultiIndex
    pdf.index = pd.MultiIndex.from_tuples(
        [("a", "v"), ("b", "w"), ("c", "x"), ("d", "y"), ("e", "z")]
    )
    kdf = ks.from_pandas(pdf)

    self.assert_eq(pdf.lookup([("a", "v")], ["C"]), kdf.lookup([("a", "v")], ["C"]))
    self.assert_list_eq(
        pdf.lookup([("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]),
        kdf.lookup([("a", "v"), ("d", "y"), ("e", "z")], ["A", "C", "A"]),
    )

    # Mismatched label lengths on a MultiIndex
    err_msg = "Row labels must have same size as column labels"
    with self.assertRaisesRegex(ValueError, err_msg):
        kdf.lookup([0, 3, 4], ["A", "C"])

    # Koalas Series/Index are rejected for both arguments
    err_msg = "'row_labels' doesn't support type 'Index'."
    with self.assertRaisesRegex(TypeError, err_msg):
        kdf.lookup(ks.Index([0]), ["C"])
    err_msg = "'row_labels' doesn't support type 'Series'."
    with self.assertRaisesRegex(TypeError, err_msg):
        kdf.lookup(ks.Series([0]), ["C"])
    err_msg = "'col_labels' doesn't support type 'Index'."
    with self.assertRaisesRegex(TypeError, err_msg):
        kdf.lookup([0], ks.Index(["C"]))
    err_msg = "'col_labels' doesn't support type 'Series'."
    with self.assertRaisesRegex(TypeError, err_msg):
        kdf.lookup([0], ks.Series(["C"]))