From d86c3acecf5a5cc9c9df6dfd0b869daaa442ef90 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 12 Jun 2024 06:15:39 +0000 Subject: [PATCH 1/8] Add more supported type annotations, fix spark connect issue --- .devcontainer/devcontainer.json | 2 +- fugue/dataframe/function_wrapper.py | 29 +++++++++---------- fugue_spark/_utils/misc.py | 2 +- fugue_version/__init__.py | 2 +- .../fugue/dataframe/test_function_wrapper.py | 14 ++++++++- 5 files changed, 29 insertions(+), 20 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 55677d0a..afd53921 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -38,7 +38,7 @@ ], "postCreateCommand": "make devenv", "features": { - "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + "ghcr.io/devcontainers/features/docker-in-docker:2.11.0": {}, "ghcr.io/devcontainers/features/java:1": { "version": "11" } diff --git a/fugue/dataframe/function_wrapper.py b/fugue/dataframe/function_wrapper.py index 342d103c..f6571f22 100644 --- a/fugue/dataframe/function_wrapper.py +++ b/fugue/dataframe/function_wrapper.py @@ -20,6 +20,7 @@ PositionalParam, function_wrapper, ) +from triad.utils.convert import compare_annotations from triad.utils.iter import EmptyAwareIterable, make_empty_aware from ..constants import FUGUE_ENTRYPOINT @@ -37,6 +38,14 @@ from .pandas_dataframe import PandasDataFrame +def _compare_iter(tp: Any) -> Any: + return lambda x: compare_annotations( + x, Iterable[tp] # type:ignore + ) or compare_annotations( + x, Iterator[tp] # type:ignore + ) + + @function_wrapper(FUGUE_ENTRYPOINT) class DataFrameFunctionWrapper(FunctionWrapper): @property @@ -228,10 +237,7 @@ def count(self, df: List[List[Any]]) -> int: return len(df) -@fugue_annotated_param( - Iterable[List[Any]], - matcher=lambda x: x == Iterable[List[Any]] or x == Iterator[List[Any]], -) +@fugue_annotated_param(Iterable[List[Any]], matcher=_compare_iter(List[Any])) class _IterableListParam(_LocalNoSchemaDataFrameParam): @no_type_check def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[List[Any]]: @@ -288,10 +294,7 @@ def count(self, df: List[Dict[str, Any]]) -> int: return len(df) -@fugue_annotated_param( - Iterable[Dict[str, Any]], - matcher=lambda x: x == Iterable[Dict[str, Any]] or x == Iterator[Dict[str, Any]], -) +@fugue_annotated_param(Iterable[Dict[str, Any]], matcher=_compare_iter(Dict[str, Any])) class _IterableDictParam(_LocalNoSchemaDataFrameParam): @no_type_check def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[Dict[str, Any]]: @@ -360,10 +363,7 @@ def format_hint(self) -> Optional[str]: return "pandas" -@fugue_annotated_param( - Iterable[pd.DataFrame], - matcher=lambda x: x == Iterable[pd.DataFrame] or x == Iterator[pd.DataFrame], -) +@fugue_annotated_param(Iterable[pd.DataFrame], matcher=_compare_iter(pd.DataFrame)) class _IterablePandasParam(LocalDataFrameParam): @no_type_check def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pd.DataFrame]: @@ -419,10 +419,7 @@ def format_hint(self) -> Optional[str]: return "pyarrow" -@fugue_annotated_param( - Iterable[pa.Table], - matcher=lambda x: x == Iterable[pa.Table] or x == Iterator[pa.Table], -) +@fugue_annotated_param(Iterable[pa.Table], matcher=_compare_iter(pa.Table)) class _IterableArrowParam(LocalDataFrameParam): @no_type_check def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pa.Table]: diff --git a/fugue_spark/_utils/misc.py b/fugue_spark/_utils/misc.py index 78b28041..37ea7e79 100644 --- a/fugue_spark/_utils/misc.py +++ b/fugue_spark/_utils/misc.py @@ -3,7 +3,7 @@ try: from pyspark.sql.connect.session import SparkSession as SparkConnectSession from pyspark.sql.connect.dataframe import DataFrame as SparkConnectDataFrame -except ImportError: # pragma: no cover +except Exception: # pragma: no cover SparkConnectSession = None SparkConnectDataFrame = None import pyspark.sql as ps diff --git a/fugue_version/__init__.py b/fugue_version/__init__.py index 3e2f46a3..d69d16e9 100644 --- a/fugue_version/__init__.py +++ b/fugue_version/__init__.py @@ -1 +1 @@ -__version__ = "0.9.0" +__version__ = "0.9.1" diff --git a/tests/fugue/dataframe/test_function_wrapper.py b/tests/fugue/dataframe/test_function_wrapper.py index fa634920..4cced3e9 100644 --- a/tests/fugue/dataframe/test_function_wrapper.py +++ b/tests/fugue/dataframe/test_function_wrapper.py @@ -26,10 +26,14 @@ ) from fugue.dataframe.utils import _df_eq as df_eq from fugue.dev import DataFrameFunctionWrapper +import sys def test_function_wrapper(): - for f in [f20, f21, f212, f22, f23, f24, f25, f26, f30, f31, f32, f35, f36]: + fs = [f20, f21, f212, f22, f23, f24, f25, f26, f30, f31, f32, f35, f36] + if sys.version_info >= (3, 9): + fs.append(f33) + for f in fs: df = ArrayDataFrame([[0]], "a:int") w = DataFrameFunctionWrapper(f, "^[ldsp][ldsp]$", "[ldspq]") res = w.run([df], dict(a=df), ignore_unknown=False, output_schema="a:int") @@ -372,6 +376,14 @@ def f32( return ArrayDataFrame(arr, "a:int").as_dict_iterable() +def f33( + e: list[dict[str, Any]], a: Iterable[dict[str, Any]] +) -> EmptyAwareIterable[Dict[str, Any]]: + e += list(a) + arr = [[x["a"]] for x in e] + return ArrayDataFrame(arr, "a:int").as_dict_iterable() + + def f35(e: pd.DataFrame, a: LocalDataFrame) -> Iterable[pd.DataFrame]: e = PandasDataFrame(e, "a:int").as_pandas() a = ArrayDataFrame(a, "a:int").as_pandas() From 20c30ddf9dc50d013e9311b63b3d9f23f37775a1 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 12 Jun 2024 06:19:08 +0000 Subject: [PATCH 2/8] update --- setup.py | 2 +- tests/fugue/dataframe/test_function_wrapper.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5386b698..cd311456 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def get_version() -> str: keywords="distributed spark dask ray sql dsl domain specific language", url="http://github.com/fugue-project/fugue", install_requires=[ - "triad>=0.9.6", + "triad>=0.9.7", "adagio>=0.2.4", ], extras_require={ diff --git a/tests/fugue/dataframe/test_function_wrapper.py b/tests/fugue/dataframe/test_function_wrapper.py index 4cced3e9..6c79806a 100644 --- a/tests/fugue/dataframe/test_function_wrapper.py +++ b/tests/fugue/dataframe/test_function_wrapper.py @@ -1,4 +1,7 @@ +from __future__ import annotations + import copy +import sys from typing import Any, Dict, Iterable, Iterator, List import pandas as pd @@ -26,7 +29,6 @@ ) from fugue.dataframe.utils import _df_eq as df_eq from fugue.dev import DataFrameFunctionWrapper -import sys def test_function_wrapper(): From 370025ea2bff075f77f82a674714b5581f82434b Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 12 Jun 2024 06:23:43 +0000 Subject: [PATCH 3/8] update --- .github/workflows/test_all.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_all.yml b/.github/workflows/test_all.yml index 095a99d8..3a8ef8c9 100644 --- a/.github/workflows/test_all.yml +++ b/.github/workflows/test_all.yml @@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8, "3.10"] # TODO: add back 3.11 when dask-sql is compatible + python-version: [3.8, "3.10", "3.11"] steps: - uses: actions/checkout@v2 From de007060df69b425a8e410aa47ac56aac1ef1b22 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 12 Jun 2024 06:44:36 +0000 Subject: [PATCH 4/8] update --- tests/fugue_duckdb/test_dataframe.py | 4 +- tests/fugue_duckdb/test_utils.py | 2 +- x.txt | 2868 ++++++++++++++++++++++++++ 3 files changed, 2871 insertions(+), 3 deletions(-) create mode 100644 x.txt diff --git a/tests/fugue_duckdb/test_dataframe.py b/tests/fugue_duckdb/test_dataframe.py index 55c166ca..4a9ff812 100644 --- a/tests/fugue_duckdb/test_dataframe.py +++ b/tests/fugue_duckdb/test_dataframe.py @@ -15,7 +15,7 @@ class DuckDataFrameTests(DataFrameTests.Tests): def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame: df = ArrowDataFrame(data, schema) - return DuckDataFrame(duckdb.from_arrow(df.native, self.context.session)) + return DuckDataFrame(duckdb.from_arrow(df.native, connection=self.context.session)) def test_as_array_special_values(self): for func in [ @@ -69,7 +69,7 @@ def test_duck_as_local(self): class NativeDuckDataFrameTests(DataFrameTests.NativeTests): def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame: df = ArrowDataFrame(data, schema) - return DuckDataFrame(duckdb.from_arrow(df.native, self.context.session)).native + return DuckDataFrame(duckdb.from_arrow(df.native, connection=self.context.session)).native def to_native_df(self, pdf: pd.DataFrame) -> Any: return duckdb.from_df(pdf) diff --git a/tests/fugue_duckdb/test_utils.py b/tests/fugue_duckdb/test_utils.py index c8ad9090..9e471d57 100644 --- a/tests/fugue_duckdb/test_utils.py +++ b/tests/fugue_duckdb/test_utils.py @@ -45,7 +45,7 @@ def test_type_conversion(backend_context): def assert_(tp): dt = duckdb.from_arrow( - pa.Table.from_pydict(dict(a=pa.nulls(2, tp))), con + pa.Table.from_pydict(dict(a=pa.nulls(2, tp))), connection=con ).types[0] assert to_pa_type(dt) == tp dt = to_duck_type(tp) diff --git a/x.txt b/x.txt new file mode 100644 index 00000000..665d9bf3 --- /dev/null +++ b/x.txt @@ -0,0 +1,2868 @@ +============================= test session starts ============================== +platform linux -- Python 3.10.14, pytest-8.2.1, pluggy-1.5.0 -- /usr/local/py-utils/venvs/pytest/bin/python +cachedir: .pytest_cache +Fugue tests will be initialized with options: + fugue.test.dummy = dummy + fugue.test = True + ray.num_cpus = 2 + dask.processes = True + dask.n_workers = 3 + dask.threads_per_worker = 1 +rootdir: /workspaces/fugue +configfile: setup.cfg +plugins: cov-5.0.0, rerunfailures-10.2, fugue-0.9.1, mock-3.14.0, anyio-4.4.0 +collecting ... collected 1 item + +tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] <- ../../home/vscode/.local/lib/python3.10/site-packages/fugue_test/execution_suite.py FAILED [100%] + +=================================== FAILURES =================================== +__________________ DaskExecutionEngineTests.test_select[dask] __________________ + +self = + + @pytest.mark.skipif(not HAS_QPD, reason="qpd not working") + def test_select(self): + a = ArrayDataFrame( + [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" + ) + + # simple + b = fa.select(a, col("b"), (col("b") + 1).alias("c").cast(str)) + self.df_eq( + b, + [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]], + "b:int,c:str", + throw=True, + ) + + # with distinct + b = fa.select( + a, col("b"), (col("b") + 1).alias("c").cast(str), distinct=True + ) + self.df_eq( + b, + [[2, "3"], [1, "2"], [4, "5"]], + "b:int,c:str", + throw=True, + ) + + # wildcard + b = fa.select(a, all_cols(), where=col("a") + col("b") == 3) + self.df_eq(b, [[1, 2]], "a:double,b:int", throw=True) + + # aggregation + b = fa.select(a, col("a"), ff.sum(col("b")).cast(float).alias("b")) + self.df_eq(b, [[1, 2], [3, 4], [None, 7]], "a:double,b:double", throw=True) + + # having + # https://github.com/fugue-project/fugue/issues/222 + col_b = ff.sum(col("b")) + b = fa.select( + a, + col("a"), + col_b.cast(float).alias("c"), + having=(col_b >= 7) | (col("a") == 1), + ) +> self.df_eq(b, [[1, 2], [None, 7]], "a:double,c:double", throw=True) + +/home/vscode/.local/lib/python3.10/site-packages/fugue_test/execution_suite.py:139: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +/home/vscode/.local/lib/python3.10/site-packages/fugue/test/plugins.py:191: in df_eq + return _df_eq(*args, **kwargs) +/home/vscode/.local/lib/python3.10/site-packages/fugue/dataframe/utils.py:58: in _df_eq + df1 = as_fugue_df(df).as_local_bounded() +fugue_dask/dataframe.py:110: in as_local_bounded + res = PandasDataFrame(self.as_pandas(), self.schema) +fugue_dask/dataframe.py:148: in as_pandas + pdf = self.native.compute().reset_index(drop=True) +/home/vscode/.local/lib/python3.10/site-packages/dask_expr/_collection.py:476: in compute + return DaskMethodsMixin.compute(out, **kwargs) +/home/vscode/.local/lib/python3.10/site-packages/dask/base.py:375: in compute + (result,) = compute(self, traverse=False, **kwargs) +/home/vscode/.local/lib/python3.10/site-packages/dask/base.py:661: in compute + results = schedule(dsk, keys, **kwargs) +/home/vscode/.local/lib/python3.10/site-packages/dask_expr/_expr.py:3727: in _execute_task + return dask.core.get(graph, name) +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + + from __future__ import annotations + + from contextlib import suppress + import sys + from typing import ( + TYPE_CHECKING, + cast, + final, + ) + import warnings + + import numpy as np + + from pandas._config import using_copy_on_write + + from pandas._libs.indexing import NDFrameIndexerBase + from pandas._libs.lib import item_from_zerodim + from pandas.compat import PYPY + from pandas.errors import ( + AbstractMethodError, + ChainedAssignmentError, + IndexingError, + InvalidIndexError, + LossySetitemError, + _chained_assignment_msg, + ) + from pandas.util._decorators import doc + + from pandas.core.dtypes.cast import ( + can_hold_element, + maybe_promote, + ) + from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_sequence, + ) + from pandas.core.dtypes.concat import concat_compat + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, + ) + from pandas.core.dtypes.missing import ( + infer_fill_value, + is_valid_na_for_dtype, + isna, + na_value_for_dtype, + ) + + from pandas.core import algorithms as algos + import pandas.core.common as com + from pandas.core.construction import ( + array as pd_array, + extract_array, + ) + from pandas.core.indexers import ( + check_array_indexer, + is_list_like_indexer, + is_scalar_indexer, + length_of_indexer, + ) + from pandas.core.indexes.api import ( + Index, + MultiIndex, + ) + + if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Sequence, + ) + + from pandas._typing import ( + Axis, + AxisInt, + Self, + ) + + from pandas import ( + DataFrame, + Series, + ) + + # "null slice" + _NS = slice(None, None) + _one_ellipsis_message = "indexer may only contain one '...' entry" + + + # the public IndexSlicerMaker + class _IndexSlice: + """ + Create an object to more easily perform multi-index slicing. + + See Also + -------- + MultiIndex.remove_unused_levels : New MultiIndex with no unused levels. + + Notes + ----- + See :ref:`Defined Levels ` + for further info on slicing a MultiIndex. + + Examples + -------- + >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) + >>> columns = ['foo', 'bar'] + >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), + ... index=midx, columns=columns) + + Using the default slice command: + + >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :] + foo bar + A0 B0 0 1 + B1 2 3 + A1 B0 8 9 + B1 10 11 + + Using the IndexSlice class for a more intuitive command: + + >>> idx = pd.IndexSlice + >>> dfmi.loc[idx[:, 'B0':'B1'], :] + foo bar + A0 B0 0 1 + B1 2 3 + A1 B0 8 9 + B1 10 11 + """ + + def __getitem__(self, arg): + return arg + + + IndexSlice = _IndexSlice() + + + class IndexingMixin: + """ + Mixin for adding .loc/.iloc/.at/.iat to Dataframes and Series. + """ + + @property + def iloc(self) -> _iLocIndexer: + """ + Purely integer-location based indexing for selection by position. + + ``.iloc[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), but may also be used with a boolean + array. + + Allowed inputs are: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + - A boolean array. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + This is useful in method chains, when you don't have a reference to the + calling object, but would like to base your selection on some value. + - A tuple of row and column indexes. The tuple elements consist of one of the + above inputs, e.g. ``(0, 1)``. + + ``.iloc`` will raise ``IndexError`` if a requested indexer is + out-of-bounds, except *slice* indexers which allow out-of-bounds + indexing (this conforms with python/numpy *slice* semantics). + + See more at :ref:`Selection by Position `. + + See Also + -------- + DataFrame.iat : Fast integer location scalar accessor. + DataFrame.loc : Purely label-location based indexer for selection by label. + Series.iloc : Purely integer-location based indexing for + selection by position. + + Examples + -------- + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, + ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + >>> df = pd.DataFrame(mydict) + >>> df + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + **Indexing just the rows** + + With a scalar integer. + + >>> type(df.iloc[0]) + + >>> df.iloc[0] + a 1 + b 2 + c 3 + d 4 + Name: 0, dtype: int64 + + With a list of integers. + + >>> df.iloc[[0]] + a b c d + 0 1 2 3 4 + >>> type(df.iloc[[0]]) + + + >>> df.iloc[[0, 1]] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + + With a `slice` object. + + >>> df.iloc[:3] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + With a boolean mask the same length as the index. + + >>> df.iloc[[True, False, True]] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + With a callable, useful in method chains. The `x` passed + to the ``lambda`` is the DataFrame being sliced. This selects + the rows whose index label even. + + >>> df.iloc[lambda x: x.index % 2 == 0] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + **Indexing both axes** + + You can mix the indexer types for the index and columns. Use ``:`` to + select the entire axis. + + With scalar integers. + + >>> df.iloc[0, 1] + 2 + + With lists of integers. + + >>> df.iloc[[0, 2], [1, 3]] + b d + 0 2 4 + 2 2000 4000 + + With `slice` objects. + + >>> df.iloc[1:3, 0:3] + a b c + 1 100 200 300 + 2 1000 2000 3000 + + With a boolean array whose length matches the columns. + + >>> df.iloc[:, [True, False, True, False]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + + With a callable function that expects the Series or DataFrame. + + >>> df.iloc[:, lambda df: [0, 2]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + """ + return _iLocIndexer("iloc", self) + + @property + def loc(self) -> _LocIndexer: + """ + Access a group of rows and columns by label(s) or a boolean array. + + ``.loc[]`` is primarily label based, but may also be used with a + boolean array. + + Allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is + interpreted as a *label* of the index, and **never** as an + integer position along the index). + - A list or array of labels, e.g. ``['a', 'b', 'c']``. + - A slice object with labels, e.g. ``'a':'f'``. + + .. warning:: Note that contrary to usual python slices, **both** the + start and the stop are included + + - A boolean array of the same length as the axis being sliced, + e.g. ``[True, False, True]``. + - An alignable boolean Series. The index of the key will be aligned before + masking. + - An alignable Index. The Index of the returned selection will be the input. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above) + + See more at :ref:`Selection by Label `. + + Raises + ------ + KeyError + If any items are not found. + IndexingError + If an indexed key is passed and its index is unalignable to the frame index. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.iloc : Access group of rows and columns by integer position(s). + DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the + Series/DataFrame. + Series.loc : Access group of values using labels. + + Examples + -------- + **Getting values** + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) + >>> df + max_speed shield + cobra 1 2 + viper 4 5 + sidewinder 7 8 + + Single label. Note this returns the row as a Series. + + >>> df.loc['viper'] + max_speed 4 + shield 5 + Name: viper, dtype: int64 + + List of labels. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[['viper', 'sidewinder']] + max_speed shield + viper 4 5 + sidewinder 7 8 + + Single label for row and column + + >>> df.loc['cobra', 'shield'] + 2 + + Slice with labels for row and single label for column. As mentioned + above, note that both the start and stop of the slice are included. + + >>> df.loc['cobra':'viper', 'max_speed'] + cobra 1 + viper 4 + Name: max_speed, dtype: int64 + + Boolean list with the same length as the row axis + + >>> df.loc[[False, False, True]] + max_speed shield + sidewinder 7 8 + + Alignable boolean Series: + + >>> df.loc[pd.Series([False, True, False], + ... index=['viper', 'sidewinder', 'cobra'])] + max_speed shield + sidewinder 7 8 + + Index (same behavior as ``df.reindex``) + + >>> df.loc[pd.Index(["cobra", "viper"], name="foo")] + max_speed shield + foo + cobra 1 2 + viper 4 5 + + Conditional that returns a boolean Series + + >>> df.loc[df['shield'] > 6] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series with column labels specified + + >>> df.loc[df['shield'] > 6, ['max_speed']] + max_speed + sidewinder 7 + + Multiple conditional using ``&`` that returns a boolean Series + + >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] + max_speed shield + viper 4 5 + + Multiple conditional using ``|`` that returns a boolean Series + + >>> df.loc[(df['max_speed'] > 4) | (df['shield'] < 5)] + max_speed shield + cobra 1 2 + sidewinder 7 8 + + Please ensure that each condition is wrapped in parentheses ``()``. + See the :ref:`user guide` + for more details and explanations of Boolean indexing. + + .. note:: + If you find yourself using 3 or more conditionals in ``.loc[]``, + consider using :ref:`advanced indexing`. + + See below for using ``.loc[]`` on MultiIndex DataFrames. + + Callable that returns a boolean Series + + >>> df.loc[lambda df: df['shield'] == 8] + max_speed shield + sidewinder 7 8 + + **Setting values** + + Set value for all items matching the list of labels + + >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df + max_speed shield + cobra 1 2 + viper 4 50 + sidewinder 7 50 + + Set value for an entire row + + >>> df.loc['cobra'] = 10 + >>> df + max_speed shield + cobra 10 10 + viper 4 50 + sidewinder 7 50 + + Set value for an entire column + + >>> df.loc[:, 'max_speed'] = 30 + >>> df + max_speed shield + cobra 30 10 + viper 30 50 + sidewinder 30 50 + + Set value for rows matching callable condition + + >>> df.loc[df['shield'] > 35] = 0 + >>> df + max_speed shield + cobra 30 10 + viper 0 0 + sidewinder 0 0 + + Add value matching location + + >>> df.loc["viper", "shield"] += 5 + >>> df + max_speed shield + cobra 30 10 + viper 0 5 + sidewinder 0 0 + + Setting using a ``Series`` or a ``DataFrame`` sets the values matching the + index labels, not the index positions. + + >>> shuffled_df = df.loc[["viper", "cobra", "sidewinder"]] + >>> df.loc[:] += shuffled_df + >>> df + max_speed shield + cobra 60 20 + viper 0 10 + sidewinder 0 0 + + **Getting values on a DataFrame with an index that has integer labels** + + Another example using integers for the index + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + Slice with integer labels for rows. As mentioned above, note that both + the start and stop of the slice are included. + + >>> df.loc[7:9] + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + **Getting values with a MultiIndex** + + A number of examples using a DataFrame with a MultiIndex + + >>> tuples = [ + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ] + >>> index = pd.MultiIndex.from_tuples(tuples) + >>> values = [[12, 2], [0, 4], [10, 20], + ... [1, 4], [7, 1], [16, 36]] + >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> df + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Single label. Note this returns a DataFrame with a single index. + + >>> df.loc['cobra'] + max_speed shield + mark i 12 2 + mark ii 0 4 + + Single index tuple. Note this returns a Series. + + >>> df.loc[('cobra', 'mark ii')] + max_speed 0 + shield 4 + Name: (cobra, mark ii), dtype: int64 + + Single label for row and column. Similar to passing in a tuple, this + returns a Series. + + >>> df.loc['cobra', 'mark i'] + max_speed 12 + shield 2 + Name: (cobra, mark i), dtype: int64 + + Single tuple. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[[('cobra', 'mark ii')]] + max_speed shield + cobra mark ii 0 4 + + Single tuple for the index with a single label for the column + + >>> df.loc[('cobra', 'mark i'), 'shield'] + 2 + + Slice from index tuple to single label + + >>> df.loc[('cobra', 'mark i'):'viper'] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Slice from index tuple to index tuple + + >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + + Please see the :ref:`user guide` + for more details and explanations of advanced indexing. + """ + return _LocIndexer("loc", self) + + @property + def at(self) -> _AtIndexer: + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + * If getting a value and 'label' does not exist in a DataFrame or + Series. + ValueError + * If row/column label pair is not a tuple or if any label from + the pair is not a scalar for DataFrame. + * If label is list-like (*excluding* NamedTuple) for Series. + + See Also + -------- + DataFrame.at : Access a single value for a row/column pair by label. + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer + position(s). + Series.at : Access a single value by label. + Series.iat : Access a single value by integer position. + Series.loc : Access a group of rows by label(s). + Series.iloc : Access a group of rows by integer position(s). + + Notes + ----- + See :ref:`Fast scalar value getting and setting ` + for more details. + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Set value at specified row/column pair + + >>> df.at[4, 'B'] = 10 + >>> df.at[4, 'B'] + 10 + + Get value within a Series + + >>> df.loc[5].at['B'] + 4 + """ + return _AtIndexer("at", self) + + @property + def iat(self) -> _iAtIndexer: + """ + Access a single value for a row/column pair by integer position. + + Similar to ``iloc``, in that both provide integer-based lookups. Use + ``iat`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + IndexError + When integer position is out of bounds. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer position(s). + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Set value at specified row/column pair + + >>> df.iat[1, 2] = 10 + >>> df.iat[1, 2] + 10 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + """ + return _iAtIndexer("iat", self) + + + class _LocationIndexer(NDFrameIndexerBase): + _valid_types: str + axis: AxisInt | None = None + + # sub-classes need to set _takeable + _takeable: bool + + @final + def __call__(self, axis: Axis | None = None) -> Self: + # we need to return a copy of ourselves + new_self = type(self)(self.name, self.obj) + + if axis is not None: + axis_int_none = self.obj._get_axis_number(axis) + else: + axis_int_none = axis + new_self.axis = axis_int_none + return new_self + + def _get_setitem_indexer(self, key): + """ + Convert a potentially-label-based key into a positional indexer. + """ + if self.name == "loc": + # always holds here bc iloc overrides _get_setitem_indexer + self._ensure_listlike_indexer(key) + + if isinstance(key, tuple): + for x in key: + check_dict_or_set_indexers(x) + + if self.axis is not None: + key = _tupleize_axis_indexer(self.ndim, self.axis, key) + + ax = self.obj._get_axis(0) + + if ( + isinstance(ax, MultiIndex) + and self.name != "iloc" + and is_hashable(key) + and not isinstance(key, slice) + ): + with suppress(KeyError, InvalidIndexError): + # TypeError e.g. passed a bool + return ax.get_loc(key) + + if isinstance(key, tuple): + with suppress(IndexingError): + # suppress "Too many indexers" + return self._convert_tuple(key) + + if isinstance(key, range): + # GH#45479 test_loc_setitem_range_key + key = list(key) + + return self._convert_to_indexer(key, axis=0) + + @final + def _maybe_mask_setitem_value(self, indexer, value): + """ + If we have obj.iloc[mask] = series_or_frame and series_or_frame has the + same length as obj, we treat this as obj.iloc[mask] = series_or_frame[mask], + similar to Series.__setitem__. + + Note this is only for loc, not iloc. + """ + + if ( + isinstance(indexer, tuple) + and len(indexer) == 2 + and isinstance(value, (ABCSeries, ABCDataFrame)) + ): + pi, icols = indexer + ndim = value.ndim + if com.is_bool_indexer(pi) and len(value) == len(pi): + newkey = pi.nonzero()[0] + + if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1: + # e.g. test_loc_setitem_boolean_mask_allfalse + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) + indexer = (newkey, icols) + + elif ( + isinstance(icols, np.ndarray) + and icols.dtype.kind == "i" + and len(icols) == 1 + ): + if ndim == 1: + # We implicitly broadcast, though numpy does not, see + # github.com/pandas-dev/pandas/pull/45501#discussion_r789071825 + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) + indexer = (newkey, icols) + + elif ndim == 2 and value.shape[1] == 1: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_frame(indexer, value) + indexer = (newkey, icols) + elif com.is_bool_indexer(indexer): + indexer = indexer.nonzero()[0] + + return indexer, value + + @final + def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: + """ + Ensure that a list-like of column labels are all present by adding them if + they do not already exist. + + Parameters + ---------- + key : list-like of column labels + Target labels. + axis : key axis if known + """ + column_axis = 1 + + # column only exists in 2-dimensional DataFrame + if self.ndim != 2: + return + + orig_key = key + if isinstance(key, tuple) and len(key) > 1: + # key may be a tuple if we are .loc + # if length of key is > 1 set key to column part + key = key[column_axis] + axis = column_axis + + if ( + axis == column_axis + and not isinstance(self.obj.columns, MultiIndex) + and is_list_like_indexer(key) + and not com.is_bool_indexer(key) + and all(is_hashable(k) for k in key) + ): + # GH#38148 + keys = self.obj.columns.union(key, sort=False) + diff = Index(key).difference(self.obj.columns, sort=False) + + if len(diff) and com.is_null_slice(orig_key[0]): + # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B" + # is a new column, add the new columns with dtype=np.void + # so that later when we go through setitem_single_column + # we will use isetitem. Without this, the reindex_axis + # below would create float64 columns in this example, which + # would successfully hold 7, so we would end up with the wrong + # dtype. + indexer = np.arange(len(keys), dtype=np.intp) + indexer[len(self.obj.columns) :] = -1 + new_mgr = self.obj._mgr.reindex_indexer( + keys, indexer=indexer, axis=0, only_slice=True, use_na_proxy=True + ) + self.obj._mgr = new_mgr + return + + self.obj._mgr = self.obj._mgr.reindex_axis(keys, axis=0, only_slice=True) + + @final + def __setitem__(self, key, value) -> None: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self.obj) <= 2: + warnings.warn( + _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 + ) + + check_dict_or_set_indexers(key) + if isinstance(key, tuple): + key = tuple(list(x) if is_iterator(x) else x for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + else: + key = com.apply_if_callable(key, self.obj) + indexer = self._get_setitem_indexer(key) + self._has_valid_setitem_indexer(key) + + iloc = self if self.name == "iloc" else self.obj.iloc + iloc._setitem_with_indexer(indexer, value, self.name) + + def _validate_key(self, key, axis: AxisInt): + """ + Ensure that key is valid for current indexer. + + Parameters + ---------- + key : scalar, slice or list-like + Key requested. + axis : int + Dimension on which the indexing is being made. + + Raises + ------ + TypeError + If the key (or some element of it) has wrong type. + IndexError + If the key (or some element of it) is out of bounds. + KeyError + If the key was not found. + """ + raise AbstractMethodError(self) + + @final + def _expand_ellipsis(self, tup: tuple) -> tuple: + """ + If a tuple key includes an Ellipsis, replace it with an appropriate + number of null slices. + """ + if any(x is Ellipsis for x in tup): + if tup.count(Ellipsis) > 1: + raise IndexingError(_one_ellipsis_message) + + if len(tup) == self.ndim: + # It is unambiguous what axis this Ellipsis is indexing, + # treat as a single null slice. + i = tup.index(Ellipsis) + # FIXME: this assumes only one Ellipsis + new_key = tup[:i] + (_NS,) + tup[i + 1 :] + return new_key + + # TODO: other cases? only one test gets here, and that is covered + # by _validate_key_length + return tup + + @final + def _validate_tuple_indexer(self, key: tuple) -> tuple: + """ + Check the key for valid keys across my indexer. + """ + key = self._validate_key_length(key) + key = self._expand_ellipsis(key) + for i, k in enumerate(key): + try: + self._validate_key(k, i) + except ValueError as err: + raise ValueError( + "Location based indexing can only have " + f"[{self._valid_types}] types" + ) from err + return key + + @final + def _is_nested_tuple_indexer(self, tup: tuple) -> bool: + """ + Returns + ------- + bool + """ + if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): + return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) + return False + + @final + def _convert_tuple(self, key: tuple) -> tuple: + # Note: we assume _tupleize_axis_indexer has been called, if necessary. + self._validate_key_length(key) + keyidx = [self._convert_to_indexer(k, axis=i) for i, k in enumerate(key)] + return tuple(keyidx) + + @final + def _validate_key_length(self, key: tuple) -> tuple: + if len(key) > self.ndim: + if key[0] is Ellipsis: + # e.g. Series.iloc[..., 3] reduces to just Series.iloc[3] + key = key[1:] + if Ellipsis in key: + raise IndexingError(_one_ellipsis_message) + return self._validate_key_length(key) + raise IndexingError("Too many indexers") + return key + + @final + def _getitem_tuple_same_dim(self, tup: tuple): + """ + Index with indexers that should return an object of the same dimension + as self.obj. + + This is only called after a failed call to _getitem_lowerdim. + """ + retval = self.obj + # Selecting columns before rows is signficiantly faster + start_val = (self.ndim - len(tup)) + 1 + for i, key in enumerate(reversed(tup)): + i = self.ndim - i - start_val + if com.is_null_slice(key): + continue + + retval = getattr(retval, self.name)._getitem_axis(key, axis=i) + # We should never have retval.ndim < self.ndim, as that should + # be handled by the _getitem_lowerdim call above. + assert retval.ndim == self.ndim + + if retval is self.obj: + # if all axes were a null slice (`df.loc[:, :]`), ensure we still + # return a new object (https://github.com/pandas-dev/pandas/pull/49469) + retval = retval.copy(deep=False) + + return retval + + @final + def _getitem_lowerdim(self, tup: tuple): + # we can directly get the axis result since the axis is specified + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + return self._getitem_axis(tup, axis=axis) + + # we may have a nested tuples indexer here + if self._is_nested_tuple_indexer(tup): + return self._getitem_nested_tuple(tup) + + # we maybe be using a tuple to represent multiple dimensions here + ax0 = self.obj._get_axis(0) + # ...but iloc should handle the tuple as simple integer-location + # instead of checking it as multiindex representation (GH 13797) + if ( + isinstance(ax0, MultiIndex) + and self.name != "iloc" + and not any(isinstance(x, slice) for x in tup) + ): + # Note: in all extant test cases, replacing the slice condition with + # `all(is_hashable(x) or com.is_null_slice(x) for x in tup)` + # is equivalent. + # (see the other place where we call _handle_lowerdim_multi_index_axis0) + with suppress(IndexingError): + return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0(tup) + + tup = self._validate_key_length(tup) + + for i, key in enumerate(tup): + if is_label_like(key): + # We don't need to check for tuples here because those are + # caught by the _is_nested_tuple_indexer check above. + section = self._getitem_axis(key, axis=i) + + # We should never have a scalar section here, because + # _getitem_lowerdim is only called after a check for + # is_scalar_access, which that would be. + if section.ndim == self.ndim: + # we're in the middle of slicing through a MultiIndex + # revise the key wrt to `section` by inserting an _NS + new_key = tup[:i] + (_NS,) + tup[i + 1 :] + + else: + # Note: the section.ndim == self.ndim check above + # rules out having DataFrame here, so we dont need to worry + # about transposing. + new_key = tup[:i] + tup[i + 1 :] + + if len(new_key) == 1: + new_key = new_key[0] + + # Slices should return views, but calling iloc/loc with a null + # slice returns a new object. + if com.is_null_slice(new_key): + return section + # This is an elided recursive call to iloc/loc + return getattr(section, self.name)[new_key] + + raise IndexingError("not applicable") + + @final + def _getitem_nested_tuple(self, tup: tuple): + # we have a nested tuple so have at least 1 multi-index level + # we should be able to match up the dimensionality here + + def _contains_slice(x: object) -> bool: + # Check if object is a slice or a tuple containing a slice + if isinstance(x, tuple): + return any(isinstance(v, slice) for v in x) + elif isinstance(x, slice): + return True + return False + + for key in tup: + check_dict_or_set_indexers(key) + + # we have too many indexers for our dim, but have at least 1 + # multi-index dimension, try to see if we have something like + # a tuple passed to a series with a multi-index + if len(tup) > self.ndim: + if self.name != "loc": + # This should never be reached, but let's be explicit about it + raise ValueError("Too many indices") # pragma: no cover + if all( + (is_hashable(x) and not _contains_slice(x)) or com.is_null_slice(x) + for x in tup + ): + # GH#10521 Series should reduce MultiIndex dimensions instead of + # DataFrame, IndexingError is not raised when slice(None,None,None) + # with one row. + with suppress(IndexingError): + return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0( + tup + ) + elif isinstance(self.obj, ABCSeries) and any( + isinstance(k, tuple) for k in tup + ): + # GH#35349 Raise if tuple in tuple for series + # Do this after the all-hashable-or-null-slice check so that + # we are only getting non-hashable tuples, in particular ones + # that themselves contain a slice entry + # See test_loc_series_getitem_too_many_dimensions + raise IndexingError("Too many indexers") + + # this is a series with a multi-index specified a tuple of + # selectors + axis = self.axis or 0 + return self._getitem_axis(tup, axis=axis) + + # handle the multi-axis by taking sections and reducing + # this is iterative + obj = self.obj + # GH#41369 Loop in reverse order ensures indexing along columns before rows + # which selects only necessary blocks which avoids dtype conversion if possible + axis = len(tup) - 1 + for key in tup[::-1]: + if com.is_null_slice(key): + axis -= 1 + continue + + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis) + axis -= 1 + + # if we have a scalar, we are done + if is_scalar(obj) or not hasattr(obj, "ndim"): + break + + return obj + + def _convert_to_indexer(self, key, axis: AxisInt): + raise AbstractMethodError(self) + + @final + def __getitem__(self, key): + check_dict_or_set_indexers(key) + if type(key) is tuple: + key = tuple(list(x) if is_iterator(x) else x for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + if self._is_scalar_access(key): + return self.obj._get_value(*key, takeable=self._takeable) + return self._getitem_tuple(key) + else: + # we by definition only have the 0th axis + axis = self.axis or 0 + + maybe_callable = com.apply_if_callable(key, self.obj) + return self._getitem_axis(maybe_callable, axis=axis) + + def _is_scalar_access(self, key: tuple): + raise NotImplementedError() + + def _getitem_tuple(self, tup: tuple): + raise AbstractMethodError(self) + + def _getitem_axis(self, key, axis: AxisInt): + raise NotImplementedError() + + def _has_valid_setitem_indexer(self, indexer) -> bool: + raise AbstractMethodError(self) + + @final + def _getbool_axis(self, key, axis: AxisInt): + # caller is responsible for ensuring non-None axis + labels = self.obj._get_axis(axis) + key = check_bool_indexer(labels, key) + inds = key.nonzero()[0] + return self.obj._take_with_is_copy(inds, axis=axis) + + + @doc(IndexingMixin.loc) + class _LocIndexer(_LocationIndexer): + _takeable: bool = False + _valid_types = ( + "labels (MUST BE IN THE INDEX), slices of labels (BOTH " + "endpoints included! Can be slices of integers if the " + "index is integers), listlike of labels, boolean" + ) + + # ------------------------------------------------------------------- + # Key Checks + + @doc(_LocationIndexer._validate_key) + def _validate_key(self, key, axis: Axis): + # valid for a collection of labels (we check their presence later) + # slice of labels (where start-end in labels) + # slice of integers (only if in the labels) + # boolean not in slice and with boolean index + ax = self.obj._get_axis(axis) + if isinstance(key, bool) and not ( + is_bool_dtype(ax.dtype) + or ax.dtype.name == "boolean" + or isinstance(ax, MultiIndex) + and is_bool_dtype(ax.get_level_values(0).dtype) + ): + raise KeyError( + f"{key}: boolean label can not be used without a boolean index" + ) + + if isinstance(key, slice) and ( + isinstance(key.start, bool) or isinstance(key.stop, bool) + ): + raise TypeError(f"{key}: boolean values can not be used in a slice") + + def _has_valid_setitem_indexer(self, indexer) -> bool: + return True + + def _is_scalar_access(self, key: tuple) -> bool: + """ + Returns + ------- + bool + """ + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if len(key) != self.ndim: + return False + + for i, k in enumerate(key): + if not is_scalar(k): + return False + + ax = self.obj.axes[i] + if isinstance(ax, MultiIndex): + return False + + if isinstance(k, str) and ax._supports_partial_string_indexing: + # partial string indexing, df.loc['2000', 'A'] + # should not be considered scalar + return False + + if not ax._index_as_unique: + return False + + return True + + # ------------------------------------------------------------------- + # MultiIndex Handling + + def _multi_take_opportunity(self, tup: tuple) -> bool: + """ + Check whether there is the possibility to use ``_multi_take``. + + Currently the limit is that all axes being indexed, must be indexed with + list-likes. + + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis. + + Returns + ------- + bool + Whether the current indexing, + can be passed through `_multi_take`. + """ + if not all(is_list_like_indexer(x) for x in tup): + return False + + # just too complicated + return not any(com.is_bool_indexer(x) for x in tup) + + def _multi_take(self, tup: tuple): + """ + Create the indexers for the passed tuple of keys, and + executes the take operation. This allows the take operation to be + executed all at once, rather than once for each dimension. + Improving efficiency. + + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis. + + Returns + ------- + values: same type as the object being indexed + """ + # GH 836 + d = { + axis: self._get_listlike_indexer(key, axis) + for (key, axis) in zip(tup, self.obj._AXIS_ORDERS) + } + return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True) + + # ------------------------------------------------------------------- + + def _getitem_iterable(self, key, axis: AxisInt): + """ + Index current object with an iterable collection of keys. + + Parameters + ---------- + key : iterable + Targeted labels. + axis : int + Dimension on which the indexing is being made. + + Raises + ------ + KeyError + If no key was found. Will change in the future to raise if not all + keys were found. + + Returns + ------- + scalar, DataFrame, or Series: indexed value(s). + """ + # we assume that not com.is_bool_indexer(key), as that is + # handled before we get here. + self._validate_key(key, axis) + + # A collection of keys + keyarr, indexer = self._get_listlike_indexer(key, axis) + return self.obj._reindex_with_indexers( + {axis: [keyarr, indexer]}, copy=True, allow_dups=True + ) + + def _getitem_tuple(self, tup: tuple): + with suppress(IndexingError): + tup = self._expand_ellipsis(tup) + return self._getitem_lowerdim(tup) + + # no multi-index, so validate all of the indexers + tup = self._validate_tuple_indexer(tup) + + # ugly hack for GH #836 + if self._multi_take_opportunity(tup): + return self._multi_take(tup) + + return self._getitem_tuple_same_dim(tup) + + def _get_label(self, label, axis: AxisInt): + # GH#5567 this will fail if the label is not present in the axis. + return self.obj.xs(label, axis=axis) + + def _handle_lowerdim_multi_index_axis0(self, tup: tuple): + # we have an axis0 multi-index, handle or raise + axis = self.axis or 0 + try: + # fast path for series or for tup devoid of slices + return self._get_label(tup, axis=axis) + + except KeyError as ek: + # raise KeyError if number of indexers match + # else IndexingError will be raised + if self.ndim < len(tup) <= self.obj.index.nlevels: + raise ek + raise IndexingError("No label returned") from ek + + def _getitem_axis(self, key, axis: AxisInt): + key = item_from_zerodim(key) + if is_iterator(key): + key = list(key) + if key is Ellipsis: + key = slice(None) + + labels = self.obj._get_axis(axis) + + if isinstance(key, tuple) and isinstance(labels, MultiIndex): + key = tuple(key) + + if isinstance(key, slice): + self._validate_key(key, axis) + return self._get_slice_axis(key, axis=axis) + elif com.is_bool_indexer(key): + return self._getbool_axis(key, axis=axis) + elif is_list_like_indexer(key): + # an iterable multi-selection + if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") + + return self._getitem_iterable(key, axis=axis) + + # nested tuple slicing + if is_nested_tuple(key, labels): + locs = labels.get_locs(key) + indexer = [slice(None)] * self.ndim + indexer[axis] = locs + return self.obj.iloc[tuple(indexer)] + + # fall thru to straight lookup + self._validate_key(key, axis) + return self._get_label(key, axis=axis) + + def _get_slice_axis(self, slice_obj: slice, axis: AxisInt): + """ + This is pretty simple as we just have to deal with labels. + """ + # caller is responsible for ensuring non-None axis + obj = self.obj + if not need_slice(slice_obj): + return obj.copy(deep=False) + + labels = obj._get_axis(axis) + indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step) + + if isinstance(indexer, slice): + return self.obj._slice(indexer, axis=axis) + else: + # DatetimeIndex overrides Index.slice_indexer and may + # return a DatetimeIndex instead of a slice object. + return self.obj.take(indexer, axis=axis) + + def _convert_to_indexer(self, key, axis: AxisInt): + """ + Convert indexing key into something we can use to do actual fancy + indexing on a ndarray. + + Examples + ix[:5] -> slice(0, 5) + ix[[1,2,3]] -> [1,2,3] + ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz) + + Going by Zen of Python? + 'In the face of ambiguity, refuse the temptation to guess.' + raise AmbiguousIndexError with integer labels? + - No, prefer label-based indexing + """ + labels = self.obj._get_axis(axis) + + if isinstance(key, slice): + return labels._convert_slice_indexer(key, kind="loc") + + if ( + isinstance(key, tuple) + and not isinstance(labels, MultiIndex) + and self.ndim < 2 + and len(key) > 1 + ): + raise IndexingError("Too many indexers") + + # Slices are not valid keys passed in by the user, + # even though they are hashable in Python 3.12 + contains_slice = False + if isinstance(key, tuple): + contains_slice = any(isinstance(v, slice) for v in key) + + if is_scalar(key) or ( + isinstance(labels, MultiIndex) and is_hashable(key) and not contains_slice + ): + # Otherwise get_loc will raise InvalidIndexError + + # if we are a label return me + try: + return labels.get_loc(key) + except LookupError: + if isinstance(key, tuple) and isinstance(labels, MultiIndex): + if len(key) == labels.nlevels: + return {"key": key} + raise + except InvalidIndexError: + # GH35015, using datetime as column indices raises exception + if not isinstance(labels, MultiIndex): + raise + except ValueError: + if not is_integer(key): + raise + return {"key": key} + + if is_nested_tuple(key, labels): + if self.ndim == 1 and any(isinstance(k, tuple) for k in key): + # GH#35349 Raise if tuple in tuple for series + raise IndexingError("Too many indexers") + return labels.get_locs(key) + + elif is_list_like_indexer(key): + if is_iterator(key): + key = list(key) + + if com.is_bool_indexer(key): + key = check_bool_indexer(labels, key) + return key + else: + return self._get_listlike_indexer(key, axis)[1] + else: + try: + return labels.get_loc(key) + except LookupError: + # allow a not found key only if we are a setter + if not is_list_like_indexer(key): + return {"key": key} + raise + + def _get_listlike_indexer(self, key, axis: AxisInt): + """ + Transform a list-like of keys into a new index and an indexer. + + Parameters + ---------- + key : list-like + Targeted labels. + axis: int + Dimension on which the indexing is being made. + + Raises + ------ + KeyError + If at least one key was requested but none was found. + + Returns + ------- + keyarr: Index + New index (coinciding with 'key' if the axis is unique). + values : array-like + Indexer for the return object, -1 denotes keys not found. + """ + ax = self.obj._get_axis(axis) + axis_name = self.obj._get_axis_name(axis) + + keyarr, indexer = ax._get_indexer_strict(key, axis_name) + + return keyarr, indexer + + + @doc(IndexingMixin.iloc) + class _iLocIndexer(_LocationIndexer): + _valid_types = ( + "integer, integer slice (START point is INCLUDED, END " + "point is EXCLUDED), listlike of integers, boolean array" + ) + _takeable = True + + # ------------------------------------------------------------------- + # Key Checks + + def _validate_key(self, key, axis: AxisInt): + if com.is_bool_indexer(key): + if hasattr(key, "index") and isinstance(key.index, Index): + if key.index.inferred_type == "integer": + raise NotImplementedError( + "iLocation based boolean " + "indexing on an integer type " + "is not available" + ) + raise ValueError( + "iLocation based boolean indexing cannot use " + "an indexable as a mask" + ) + return + + if isinstance(key, slice): + return + elif is_integer(key): + self._validate_integer(key, axis) + elif isinstance(key, tuple): + # a tuple should already have been caught by this point + # so don't treat a tuple as a valid indexer + raise IndexingError("Too many indexers") + elif is_list_like_indexer(key): + if isinstance(key, ABCSeries): + arr = key._values + elif is_array_like(key): + arr = key + else: + arr = np.array(key) + len_axis = len(self.obj._get_axis(axis)) + + # check that the key has a numeric dtype + if not is_numeric_dtype(arr.dtype): + raise IndexError(f".iloc requires numeric indexers, got {arr}") + + # check that the key does not exceed the maximum size of the index + if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): + raise IndexError("positional indexers are out-of-bounds") + else: + raise ValueError(f"Can only index by location with a [{self._valid_types}]") + + def _has_valid_setitem_indexer(self, indexer) -> bool: + """ + Validate that a positional indexer cannot enlarge its target + will raise if needed, does not modify the indexer externally. + + Returns + ------- + bool + """ + if isinstance(indexer, dict): + raise IndexError("iloc cannot enlarge its target object") + + if isinstance(indexer, ABCDataFrame): + raise TypeError( + "DataFrame indexer for .iloc is not supported. " + "Consider using .loc with a DataFrame indexer for automatic alignment.", + ) + + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + + for ax, i in zip(self.obj.axes, indexer): + if isinstance(i, slice): + # should check the stop slice? + pass + elif is_list_like_indexer(i): + # should check the elements? + pass + elif is_integer(i): + if i >= len(ax): + raise IndexError("iloc cannot enlarge its target object") + elif isinstance(i, dict): + raise IndexError("iloc cannot enlarge its target object") + + return True + + def _is_scalar_access(self, key: tuple) -> bool: + """ + Returns + ------- + bool + """ + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if len(key) != self.ndim: + return False + + return all(is_integer(k) for k in key) + + def _validate_integer(self, key: int | np.integer, axis: AxisInt) -> None: + """ + Check that 'key' is a valid position in the desired axis. + + Parameters + ---------- + key : int + Requested position. + axis : int + Desired axis. + + Raises + ------ + IndexError + If 'key' is not a valid position in axis 'axis'. + """ + len_axis = len(self.obj._get_axis(axis)) + if key >= len_axis or key < -len_axis: + raise IndexError("single positional indexer is out-of-bounds") + + # ------------------------------------------------------------------- + + def _getitem_tuple(self, tup: tuple): + tup = self._validate_tuple_indexer(tup) + with suppress(IndexingError): + return self._getitem_lowerdim(tup) + + return self._getitem_tuple_same_dim(tup) + + def _get_list_axis(self, key, axis: AxisInt): + """ + Return Series values by list or array of integers. + + Parameters + ---------- + key : list-like positional indexer + axis : int + + Returns + ------- + Series object + + Notes + ----- + `axis` can only be zero. + """ + try: + return self.obj._take_with_is_copy(key, axis=axis) + except IndexError as err: + # re-raise with different error message, e.g. test_getitem_ndarray_3d + raise IndexError("positional indexers are out-of-bounds") from err + + def _getitem_axis(self, key, axis: AxisInt): + if key is Ellipsis: + key = slice(None) + elif isinstance(key, ABCDataFrame): + raise IndexError( + "DataFrame indexer is not allowed for .iloc\n" + "Consider using .loc for automatic alignment." + ) + + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + + if is_iterator(key): + key = list(key) + + if isinstance(key, list): + key = np.asarray(key) + + if com.is_bool_indexer(key): + self._validate_key(key, axis) + return self._getbool_axis(key, axis=axis) + + # a list of integers + elif is_list_like_indexer(key): + return self._get_list_axis(key, axis=axis) + + # a single integer + else: + key = item_from_zerodim(key) + if not is_integer(key): + raise TypeError("Cannot index by location index with a non-integer key") + + # validate the location + self._validate_integer(key, axis) + + return self.obj._ixs(key, axis=axis) + + def _get_slice_axis(self, slice_obj: slice, axis: AxisInt): + # caller is responsible for ensuring non-None axis + obj = self.obj + + if not need_slice(slice_obj): + return obj.copy(deep=False) + + labels = obj._get_axis(axis) + labels._validate_positional_slice(slice_obj) + return self.obj._slice(slice_obj, axis=axis) + + def _convert_to_indexer(self, key, axis: AxisInt): + """ + Much simpler as we only have to deal with our valid types. + """ + return key + + def _get_setitem_indexer(self, key): + # GH#32257 Fall through to let numpy do validation + if is_iterator(key): + key = list(key) + + if self.axis is not None: + key = _tupleize_axis_indexer(self.ndim, self.axis, key) + + return key + + # ------------------------------------------------------------------- + + def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): + """ + _setitem_with_indexer is for setting values on a Series/DataFrame + using positional indexers. + + If the relevant keys are not present, the Series/DataFrame may be + expanded. + + This method is currently broken when dealing with non-unique Indexes, + since it goes from positional indexers back to labels when calling + BlockManager methods, see GH#12991, GH#22046, GH#15686. + """ + info_axis = self.obj._info_axis_number + + # maybe partial set + take_split_path = not self.obj._mgr.is_single_block + + if not take_split_path and isinstance(value, ABCDataFrame): + # Avoid cast of values + take_split_path = not value._mgr.is_single_block + + # if there is only one block/type, still have to take split path + # unless the block is one-dimensional or it can hold the value + if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: + # in case of dict, keys are indices + val = list(value.values()) if isinstance(value, dict) else value + arr = self.obj._mgr.arrays[0] + take_split_path = not can_hold_element( + arr, extract_array(val, extract_numpy=True) + ) + + # if we have any multi-indexes that have non-trivial slices + # (not null slices) then we must take the split path, xref + # GH 10360, GH 27841 + if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): + for i, ax in zip(indexer, self.obj.axes): + if isinstance(ax, MultiIndex) and not ( + is_integer(i) or com.is_null_slice(i) + ): + take_split_path = True + break + + if isinstance(indexer, tuple): + nindexer = [] + for i, idx in enumerate(indexer): + if isinstance(idx, dict): + # reindex the axis to the new value + # and set inplace + key, _ = convert_missing_indexer(idx) + + # if this is the items axes, then take the main missing + # path first + # this correctly sets the dtype and avoids cache issues + # essentially this separates out the block that is needed + # to possibly be modified + if self.ndim > 1 and i == info_axis: + # add the new item, and set the value + # must have all defined axes if we have a scalar + # or a list-like on the non-info axes if we have a + # list-like + if not len(self.obj): + if not is_list_like_indexer(value): + raise ValueError( + "cannot set a frame with no " + "defined index and a scalar" + ) + self.obj[key] = value + return + + # add a new item with the dtype setup + if com.is_null_slice(indexer[0]): + # We are setting an entire column + self.obj[key] = value + return + elif is_array_like(value): + # GH#42099 + arr = extract_array(value, extract_numpy=True) + taker = -1 * np.ones(len(self.obj), dtype=np.intp) + empty_value = algos.take_nd(arr, taker) + if not isinstance(value, ABCSeries): + # if not Series (in which case we need to align), + # we can short-circuit + if ( + isinstance(arr, np.ndarray) + and arr.ndim == 1 + and len(arr) == 1 + ): + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + arr = arr[0, ...] + empty_value[indexer[0]] = arr + self.obj[key] = empty_value + return + + self.obj[key] = empty_value + + else: + # FIXME: GH#42099#issuecomment-864326014 + self.obj[key] = infer_fill_value(value) + + new_indexer = convert_from_missing_indexer_tuple( + indexer, self.obj.axes + ) + self._setitem_with_indexer(new_indexer, value, name) + + return + + # reindex the axis + # make sure to clear the cache because we are + # just replacing the block manager here + # so the object is the same + index = self.obj._get_axis(i) + labels = index.insert(len(index), key) + + # We are expanding the Series/DataFrame values to match + # the length of thenew index `labels`. GH#40096 ensure + # this is valid even if the index has duplicates. + taker = np.arange(len(index) + 1, dtype=np.intp) + taker[-1] = -1 + reindexers = {i: (labels, taker)} + new_obj = self.obj._reindex_with_indexers( + reindexers, allow_dups=True + ) + self.obj._mgr = new_obj._mgr + self.obj._maybe_update_cacher(clear=True) + self.obj._is_copy = None + + nindexer.append(labels.get_loc(key)) + + else: + nindexer.append(idx) + + indexer = tuple(nindexer) + else: + indexer, missing = convert_missing_indexer(indexer) + + if missing: + self._setitem_with_indexer_missing(indexer, value) + return + + if name == "loc": + # must come after setting of missing + indexer, value = self._maybe_mask_setitem_value(indexer, value) + + # align and set the values + if take_split_path: + # We have to operate column-wise + self._setitem_with_indexer_split_path(indexer, value, name) + else: + self._setitem_single_block(indexer, value, name) + + def _setitem_with_indexer_split_path(self, indexer, value, name: str): + """ + Setitem column-wise. + """ + # Above we only set take_split_path to True for 2D cases + assert self.ndim == 2 + + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + if len(indexer) > self.ndim: + raise IndexError("too many indices for array") + if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: + raise ValueError(r"Cannot set values with ndim > 2") + + if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): + from pandas import Series + + value = self._align_series(indexer, Series(value)) + + # Ensure we have something we can iterate over + info_axis = indexer[1] + ilocs = self._ensure_iterable_column_indexer(info_axis) + + pi = indexer[0] + lplane_indexer = length_of_indexer(pi, self.obj.index) + # lplane_indexer gives the expected length of obj[indexer[0]] + + # we need an iterable, with a ndim of at least 1 + # eg. don't pass through np.array(0) + if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: + if isinstance(value, ABCDataFrame): + self._setitem_with_indexer_frame_value(indexer, value, name) + + elif np.ndim(value) == 2: + # TODO: avoid np.ndim call in case it isn't an ndarray, since + # that will construct an ndarray, which will be wasteful + self._setitem_with_indexer_2d_value(indexer, value) + + elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): + # We are setting multiple rows in a single column. + self._setitem_single_column(ilocs[0], value, pi) + + elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): + # We are trying to set N values into M entries of a single + # column, which is invalid for N != M + # Exclude zero-len for e.g. boolean masking that is all-false + + if len(value) == 1 and not is_integer(info_axis): + # This is a case like df.iloc[:3, [1]] = [0] + # where we treat as df.iloc[:3, 1] = 0 + return self._setitem_with_indexer((pi, info_axis[0]), value[0]) + + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) + + elif lplane_indexer == 0 and len(value) == len(self.obj.index): + # We get here in one case via .loc with a all-False mask + pass + + elif self._is_scalar_access(indexer) and is_object_dtype( + self.obj.dtypes._values[ilocs[0]] + ): + # We are setting nested data, only possible for object dtype data + self._setitem_single_column(indexer[1], value, pi) + + elif len(ilocs) == len(value): + # We are setting multiple columns in a single row. + for loc, v in zip(ilocs, value): + self._setitem_single_column(loc, v, pi) + + elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: + # This is a setitem-with-expansion, see + # test_loc_setitem_empty_append_expands_rows_mixed_dtype + # e.g. df = DataFrame(columns=["x", "y"]) + # df["x"] = df["x"].astype(np.int64) + # df.loc[:, "x"] = [1, 2, 3] + self._setitem_single_column(ilocs[0], value, pi) + + else: + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) + + else: + # scalar value + for loc in ilocs: + self._setitem_single_column(loc, value, pi) + + def _setitem_with_indexer_2d_value(self, indexer, value): + # We get here with np.ndim(value) == 2, excluding DataFrame, + # which goes through _setitem_with_indexer_frame_value + pi = indexer[0] + + ilocs = self._ensure_iterable_column_indexer(indexer[1]) + + if not is_array_like(value): + # cast lists to array + value = np.array(value, dtype=object) + if len(ilocs) != value.shape[1]: + raise ValueError( + "Must have equal len keys and value when setting with an ndarray" + ) + + for i, loc in enumerate(ilocs): + value_col = value[:, i] + if is_object_dtype(value_col.dtype): + # casting to list so that we do type inference in setitem_single_column + value_col = value_col.tolist() + self._setitem_single_column(loc, value_col, pi) + + def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): + ilocs = self._ensure_iterable_column_indexer(indexer[1]) + + sub_indexer = list(indexer) + pi = indexer[0] + + multiindex_indexer = isinstance(self.obj.columns, MultiIndex) + + unique_cols = value.columns.is_unique + + # We do not want to align the value in case of iloc GH#37728 + if name == "iloc": + for i, loc in enumerate(ilocs): + val = value.iloc[:, i] + self._setitem_single_column(loc, val, pi) + + elif not unique_cols and value.columns.equals(self.obj.columns): + # We assume we are already aligned, see + # test_iloc_setitem_frame_duplicate_columns_multiple_blocks + for loc in ilocs: + item = self.obj.columns[loc] + if item in value: + sub_indexer[1] = item + val = self._align_series( + tuple(sub_indexer), + value.iloc[:, loc], + multiindex_indexer, + ) + else: + val = np.nan + + self._setitem_single_column(loc, val, pi) + + elif not unique_cols: + raise ValueError("Setting with non-unique columns is not allowed.") + + else: + for loc in ilocs: + item = self.obj.columns[loc] + if item in value: + sub_indexer[1] = item + val = self._align_series( + tuple(sub_indexer), + value[item], + multiindex_indexer, + using_cow=using_copy_on_write(), + ) + else: + val = np.nan + + self._setitem_single_column(loc, val, pi) + + def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: + """ + + Parameters + ---------- + loc : int + Indexer for column position + plane_indexer : int, slice, listlike[int] + The indexer we use for setitem along axis=0. + """ + pi = plane_indexer + + is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)) + + is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0 + + if is_null_setter: + # no-op, don't cast dtype later + return + + elif is_full_setter: + try: + self.obj._mgr.column_setitem( + loc, plane_indexer, value, inplace_only=True + ) + except (ValueError, TypeError, LossySetitemError): + # If we're setting an entire column and we can't do it inplace, + # then we can use value's dtype (or inferred dtype) + # instead of object + self.obj.isetitem(loc, value) + else: + # set value into the column (first attempting to operate inplace, then + # falling back to casting if necessary) + self.obj._mgr.column_setitem(loc, plane_indexer, value) + + self.obj._clear_item_cache() + + def _setitem_single_block(self, indexer, value, name: str) -> None: + """ + _setitem_with_indexer for the case when we have a single Block. + """ + from pandas import Series + + info_axis = self.obj._info_axis_number + item_labels = self.obj._get_axis(info_axis) + if isinstance(indexer, tuple): + # if we are setting on the info axis ONLY + # set using those methods to avoid block-splitting + # logic here + if ( + self.ndim == len(indexer) == 2 + and is_integer(indexer[1]) + and com.is_null_slice(indexer[0]) + ): + col = item_labels[indexer[info_axis]] + if len(item_labels.get_indexer_for([col])) == 1: + # e.g. test_loc_setitem_empty_append_expands_rows + loc = item_labels.get_loc(col) + self._setitem_single_column(loc, value, indexer[0]) + return + + indexer = maybe_convert_ix(*indexer) # e.g. test_setitem_frame_align + + if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. + value = self._align_series(indexer, Series(value)) + + elif isinstance(value, ABCDataFrame) and name != "iloc": + value = self._align_frame(indexer, value)._values + + # check for chained assignment + self.obj._check_is_chained_assignment_possible() + + # actually do the set + self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) + self.obj._maybe_update_cacher(clear=True, inplace=True) + + def _setitem_with_indexer_missing(self, indexer, value): + """ + Insert new row(s) or column(s) into the Series or DataFrame. + """ + from pandas import Series + + # reindex the axis to the new value + # and set inplace + if self.ndim == 1: + index = self.obj.index + new_index = index.insert(len(index), indexer) + + # we have a coerced indexer, e.g. a float + # that matches in an int64 Index, so + # we will not create a duplicate index, rather + # index to that element + # e.g. 0.0 -> 0 + # GH#12246 + if index.is_unique: + # pass new_index[-1:] instead if [new_index[-1]] + # so that we retain dtype + new_indexer = index.get_indexer(new_index[-1:]) + if (new_indexer != -1).any(): + # We get only here with loc, so can hard code + return self._setitem_with_indexer(new_indexer, value, "loc") + + # this preserves dtype of the value and of the object + if not is_scalar(value): + new_dtype = None + + elif is_valid_na_for_dtype(value, self.obj.dtype): + if not is_object_dtype(self.obj.dtype): + # Every NA value is suitable for object, no conversion needed + value = na_value_for_dtype(self.obj.dtype, compat=False) + + new_dtype = maybe_promote(self.obj.dtype, value)[0] + + elif isna(value): + new_dtype = None + elif not self.obj.empty and not is_object_dtype(self.obj.dtype): + # We should not cast, if we have object dtype because we can + # set timedeltas into object series + curr_dtype = self.obj.dtype + curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype) + new_dtype = maybe_promote(curr_dtype, value)[0] + else: + new_dtype = None + + new_values = Series([value], dtype=new_dtype)._values + + if len(self.obj._values): + # GH#22717 handle casting compatibility that np.concatenate + # does incorrectly + new_values = concat_compat([self.obj._values, new_values]) + self.obj._mgr = self.obj._constructor( + new_values, index=new_index, name=self.obj.name + )._mgr + self.obj._maybe_update_cacher(clear=True) + + elif self.ndim == 2: + if not len(self.obj.columns): + # no columns and scalar + raise ValueError("cannot set a frame with no defined columns") + + has_dtype = hasattr(value, "dtype") + if isinstance(value, ABCSeries): + # append a Series + value = value.reindex(index=self.obj.columns, copy=True) + value.name = indexer + elif isinstance(value, dict): + value = Series( + value, index=self.obj.columns, name=indexer, dtype=object + ) + else: + # a list-list + if is_list_like_indexer(value): + # must have conforming columns + if len(value) != len(self.obj.columns): + raise ValueError("cannot set a row with mismatched columns") + + value = Series(value, index=self.obj.columns, name=indexer) + + if not len(self.obj): + # We will ignore the existing dtypes instead of using + # internals.concat logic + df = value.to_frame().T + + idx = self.obj.index + if isinstance(idx, MultiIndex): + name = idx.names + else: + name = idx.name + + df.index = Index([indexer], name=name) + if not has_dtype: + # i.e. if we already had a Series or ndarray, keep that + # dtype. But if we had a list or dict, then do inference + df = df.infer_objects(copy=False) + self.obj._mgr = df._mgr + else: + self.obj._mgr = self.obj._append(value)._mgr + self.obj._maybe_update_cacher(clear=True) + + def _ensure_iterable_column_indexer(self, column_indexer): + """ + Ensure that our column indexer is something that can be iterated over. + """ + ilocs: Sequence[int | np.integer] | np.ndarray + if is_integer(column_indexer): + ilocs = [column_indexer] + elif isinstance(column_indexer, slice): + ilocs = np.arange(len(self.obj.columns))[column_indexer] + elif ( + isinstance(column_indexer, np.ndarray) and column_indexer.dtype.kind == "b" + ): + ilocs = np.arange(len(column_indexer))[column_indexer] + else: + ilocs = column_indexer + return ilocs + + def _align_series( + self, + indexer, + ser: Series, + multiindex_indexer: bool = False, + using_cow: bool = False, + ): + """ + Parameters + ---------- + indexer : tuple, slice, scalar + Indexer used to get the locations that will be set to `ser`. + ser : pd.Series + Values to assign to the locations specified by `indexer`. + multiindex_indexer : bool, optional + Defaults to False. Should be set to True if `indexer` was from + a `pd.MultiIndex`, to avoid unnecessary broadcasting. + + Returns + ------- + `np.array` of `ser` broadcast to the appropriate shape for assignment + to the locations selected by `indexer` + """ + if isinstance(indexer, (slice, np.ndarray, list, Index)): + indexer = (indexer,) + + if isinstance(indexer, tuple): + # flatten np.ndarray indexers + def ravel(i): + return i.ravel() if isinstance(i, np.ndarray) else i + + indexer = tuple(map(ravel, indexer)) + + aligners = [not com.is_null_slice(idx) for idx in indexer] + sum_aligners = sum(aligners) + single_aligner = sum_aligners == 1 + is_frame = self.ndim == 2 + obj = self.obj + + # are we a single alignable value on a non-primary + # dim (e.g. panel: 1,2, or frame: 0) ? + # hence need to align to a single axis dimension + # rather that find all valid dims + + # frame + if is_frame: + single_aligner = single_aligner and aligners[0] + + # we have a frame, with multiple indexers on both axes; and a + # series, so need to broadcast (see GH5206) + if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): + ser_values = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values + + # single indexer + if len(indexer) > 1 and not multiindex_indexer: + len_indexer = len(indexer[1]) + ser_values = ( + np.tile(ser_values, len_indexer).reshape(len_indexer, -1).T + ) + + return ser_values + + for i, idx in enumerate(indexer): + ax = obj.axes[i] + + # multiple aligners (or null slices) + if is_sequence(idx) or isinstance(idx, slice): + if single_aligner and com.is_null_slice(idx): + continue + new_ix = ax[idx] + if not is_list_like_indexer(new_ix): + new_ix = Index([new_ix]) + else: + new_ix = Index(new_ix) + if ser.index.equals(new_ix): + if using_cow: + return ser + return ser._values.copy() + + return ser.reindex(new_ix)._values + + # 2 dims + elif single_aligner: + # reindex along index + ax = self.obj.axes[1] + if ser.index.equals(ax) or not len(ax): + return ser._values.copy() + return ser.reindex(ax)._values + + elif is_integer(indexer) and self.ndim == 1: + if is_object_dtype(self.obj.dtype): + return ser + ax = self.obj._get_axis(0) + + if ser.index.equals(ax): + return ser._values.copy() + + return ser.reindex(ax)._values[indexer] + + elif is_integer(indexer): + ax = self.obj._get_axis(1) + + if ser.index.equals(ax): + return ser._values.copy() + + return ser.reindex(ax)._values + + raise ValueError("Incompatible indexer with Series") + + def _align_frame(self, indexer, df: DataFrame) -> DataFrame: + is_frame = self.ndim == 2 + + if isinstance(indexer, tuple): + idx, cols = None, None + sindexers = [] + for i, ix in enumerate(indexer): + ax = self.obj.axes[i] + if is_sequence(ix) or isinstance(ix, slice): + if isinstance(ix, np.ndarray): + ix = ix.ravel() + if idx is None: + idx = ax[ix] + elif cols is None: + cols = ax[ix] + else: + break + else: + sindexers.append(i) + + if idx is not None and cols is not None: + if df.index.equals(idx) and df.columns.equals(cols): + val = df.copy() + else: + val = df.reindex(idx, columns=cols) + return val + + elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame: + ax = self.obj.index[indexer] + if df.index.equals(ax): + val = df.copy() + else: + # we have a multi-index and are trying to align + # with a particular, level GH3738 + if ( + isinstance(ax, MultiIndex) + and isinstance(df.index, MultiIndex) + and ax.nlevels != df.index.nlevels + ): + raise TypeError( + "cannot align on a multi-index with out " + "specifying the join levels" + ) + + val = df.reindex(index=ax) + return val + + raise ValueError("Incompatible indexer with DataFrame") + + + class _ScalarAccessIndexer(NDFrameIndexerBase): + """ + Access scalars quickly. + """ + + # sub-classes need to set _takeable + _takeable: bool + + def _convert_key(self, key): + raise AbstractMethodError(self) + + def __getitem__(self, key): + if not isinstance(key, tuple): + # we could have a convertible item here (e.g. Timestamp) + if not is_list_like_indexer(key): + key = (key,) + else: + raise ValueError("Invalid call for scalar access (getting)!") + + key = self._convert_key(key) + return self.obj._get_value(*key, takeable=self._takeable) + + def __setitem__(self, key, value) -> None: + if isinstance(key, tuple): + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + else: + # scalar callable may return tuple + key = com.apply_if_callable(key, self.obj) + + if not isinstance(key, tuple): + key = _tuplify(self.ndim, key) + key = list(self._convert_key(key)) + if len(key) != self.ndim: + raise ValueError("Not enough indexers for scalar access (setting)!") + + self.obj._set_value(*key, value=value, takeable=self._takeable) + + + @doc(IndexingMixin.at) + class _AtIndexer(_ScalarAccessIndexer): + _takeable = False + + def _convert_key(self, key): + """ + Require they keys to be the same type as the index. (so we don't + fallback) + """ + # GH 26989 + # For series, unpacking key needs to result in the label. + # This is already the case for len(key) == 1; e.g. (1,) + if self.ndim == 1 and len(key) > 1: + key = (key,) + + return key + + @property + def _axes_are_unique(self) -> bool: + # Only relevant for self.ndim == 2 + assert self.ndim == 2 + return self.obj.index.is_unique and self.obj.columns.is_unique + + def __getitem__(self, key): + if self.ndim == 2 and not self._axes_are_unique: + # GH#33041 fall back to .loc + if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): + raise ValueError("Invalid call for scalar access (getting)!") + return self.obj.loc[key] + + return super().__getitem__(key) + + def __setitem__(self, key, value) -> None: + if self.ndim == 2 and not self._axes_are_unique: + # GH#33041 fall back to .loc + if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): + raise ValueError("Invalid call for scalar access (setting)!") + + self.obj.loc[key] = value + return + + return super().__setitem__(key, value) + + + @doc(IndexingMixin.iat) + class _iAtIndexer(_ScalarAccessIndexer): + _takeable = True + + def _convert_key(self, key): + """ + Require integer args. (and convert to label arguments) + """ + for i in key: + if not is_integer(i): + raise ValueError("iAt based indexing can only have integer indexers") + return key + + + def _tuplify(ndim: int, loc: Hashable) -> tuple[Hashable | slice, ...]: + """ + Given an indexer for the first dimension, create an equivalent tuple + for indexing over all dimensions. + + Parameters + ---------- + ndim : int + loc : object + + Returns + ------- + tuple + """ + _tup: list[Hashable | slice] + _tup = [slice(None, None) for _ in range(ndim)] + _tup[0] = loc + return tuple(_tup) + + + def _tupleize_axis_indexer(ndim: int, axis: AxisInt, key) -> tuple: + """ + If we have an axis, adapt the given key to be axis-independent. + """ + new_key = [slice(None)] * ndim + new_key[axis] = key + return tuple(new_key) + + + def check_bool_indexer(index: Index, key) -> np.ndarray: + """ + Check if key is a valid boolean indexer for an object with such index and + perform reindexing or conversion if needed. + + This function assumes that is_bool_indexer(key) == True. + + Parameters + ---------- + index : Index + Index of the object on which the indexing is done. + key : list-like + Boolean indexer to check. + + Returns + ------- + np.array + Resulting key. + + Raises + ------ + IndexError + If the key does not have the same length as index. + IndexingError + If the index of the key is unalignable to index. + """ + result = key + if isinstance(key, ABCSeries) and not key.index.equals(index): + indexer = result.index.get_indexer_for(index) + if -1 in indexer: +> raise IndexingError( + "Unalignable boolean Series provided as " + "indexer (index of the boolean Series and of " + "the indexed object do not match)." + ) +E pandas.errors.IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match). + +/home/vscode/.local/lib/python3.10/site-packages/pandas/core/indexing.py:2575: IndexingError +------------------------------ Captured log setup ------------------------------ +INFO distributed.http.proxy:proxy.py:85 To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy +INFO distributed.scheduler:scheduler.py:1711 State start +INFO distributed.scheduler:scheduler.py:4072 Scheduler at: tcp://127.0.0.1:45065 +INFO distributed.scheduler:scheduler.py:4087 dashboard at: http://127.0.0.1:8787/status +INFO distributed.scheduler:scheduler.py:7879 Registering Worker plugin shuffle +INFO distributed.nanny:nanny.py:368 Start Nanny at: 'tcp://127.0.0.1:33669' +INFO distributed.nanny:nanny.py:368 Start Nanny at: 'tcp://127.0.0.1:40145' +INFO distributed.nanny:nanny.py:368 Start Nanny at: 'tcp://127.0.0.1:33031' +INFO distributed.scheduler:scheduler.py:4424 Register worker +INFO distributed.scheduler:scheduler.py:5934 Starting worker compute stream, tcp://127.0.0.1:37897 +INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53954 +INFO distributed.scheduler:scheduler.py:4424 Register worker +INFO distributed.scheduler:scheduler.py:5934 Starting worker compute stream, tcp://127.0.0.1:43287 +INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53964 +INFO distributed.scheduler:scheduler.py:4424 Register worker +INFO distributed.scheduler:scheduler.py:5934 Starting worker compute stream, tcp://127.0.0.1:39929 +INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53972 +INFO distributed.scheduler:scheduler.py:5691 Receive client connection: Client-f60972f6-2885-11ef-9469-00224879e9b9 +INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53982 +----------------------------- Captured stderr call ----------------------------- +/home/vscode/.local/lib/python3.10/site-packages/dask/core.py:127: UserWarning: Boolean Series key will be reindexed to match DataFrame index. + return func(*(_execute_task(a, cache) for a in args)) +2024-06-12 06:35:33,039 - distributed.worker - WARNING - Compute Failed +Key: ('operation-fused-cast_df-54ec80aac981a3f77a9a8ded7f391d20', 0) +Function: execute_task +args: ((, {'operation-fused-cast_df-54ec80aac981a3f77a9a8ded7f391d20': ('cast_df-47756cff5a2ca85eca21dc71eb9af42b', 0), ('cast_df-47756cff5a2ca85eca21dc71eb9af42b', 0): (, , [('operation-91c288ed66d23ec6ec6c9eac97efd7e3', 0)], {'schema': a: double +c: double, 'use_extension_types': True, 'use_arrow_dtype': True, '_func': >, '_meta': Empty DataFrame +Columns: [a, c] +Index: []}), ('operation-91c288ed66d23ec6ec6c9eac97efd7e3', 0): (, ('reset_index-91a375bc8c1254295ada72ea3f88a352', 0), ['a', 'c']), ('reset_index-91a375bc8c1254295ada72ea3f88a352', 0): (, , [('getitem-46689d9bcc98f0ef115dccb4f41fbf9a', 0)], {'drop': False}), ('getitem-46689d9bcc98f0ef115dccb4f4 +kwargs: {} +Exception: "IndexingError('Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).')" + +---------------------------- Captured log teardown ----------------------------- +INFO distributed.scheduler:scheduler.py:5735 Remove client Client-f60972f6-2885-11ef-9469-00224879e9b9 +INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53982; closing. +INFO distributed.scheduler:scheduler.py:5735 Remove client Client-f60972f6-2885-11ef-9469-00224879e9b9 +INFO distributed.scheduler:scheduler.py:5727 Close client connection: Client-f60972f6-2885-11ef-9469-00224879e9b9 +INFO distributed.scheduler:scheduler.py:7322 Retire worker addresses (0, 1, 2) +INFO distributed.nanny:nanny.py:609 Closing Nanny at 'tcp://127.0.0.1:33669'. Reason: nanny-close +INFO distributed.nanny:nanny.py:854 Nanny asking worker to close. Reason: nanny-close +INFO distributed.nanny:nanny.py:609 Closing Nanny at 'tcp://127.0.0.1:40145'. Reason: nanny-close +INFO distributed.nanny:nanny.py:854 Nanny asking worker to close. Reason: nanny-close +INFO distributed.nanny:nanny.py:609 Closing Nanny at 'tcp://127.0.0.1:33031'. Reason: nanny-close +INFO distributed.nanny:nanny.py:854 Nanny asking worker to close. Reason: nanny-close +INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53972; closing. +INFO distributed.scheduler:scheduler.py:5209 Remove worker (stimulus_id='handle-worker-cleanup-1718174133.9382055') +INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53964; closing. +INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53954; closing. +INFO distributed.scheduler:scheduler.py:5209 Remove worker (stimulus_id='handle-worker-cleanup-1718174133.9406834') +INFO distributed.scheduler:scheduler.py:5209 Remove worker (stimulus_id='handle-worker-cleanup-1718174133.9411561') +INFO distributed.scheduler:scheduler.py:5336 Lost all workers +INFO distributed.scheduler:scheduler.py:4146 Scheduler closing due to unknown reason... +INFO distributed.scheduler:scheduler.py:4164 Scheduler closing all comms +=============================== warnings summary =============================== +../../usr/local/py-utils/venvs/pytest/lib/python3.10/site-packages/_pytest/config/__init__.py:1448 + /usr/local/py-utils/venvs/pytest/lib/python3.10/site-packages/_pytest/config/__init__.py:1448: PytestConfigWarning: Unknown config option: spark_options + + self._warn_or_fail_if_strict(f"Unknown config option: {key}\n") + +fugue_sql/__init__.py:8 + /workspaces/fugue/fugue_sql/__init__.py:8: UserWarning: fsql and FugueSQLWorkflow now should be imported directly from fugue, fugue_sql will be removed in 0.9.0 + warnings.warn( + +tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] +tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] + /home/vscode/.local/lib/python3.10/site-packages/dask_expr/_expr.py:496: UserWarning: Boolean Series key will be reindexed to match DataFrame index. + return self.operation(*args, **self._kwargs) + +-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html + +---------- coverage: platform linux, python 3.10.14-final-0 ---------- +Name Stmts Miss Cover Missing +------------------------------------------------------------------------------- +fugue/__init__.py 33 33 0% 2-90 +fugue/_utils/display.py 53 53 0% 1-86 +fugue/_utils/exception.py 37 37 0% 1-77 +fugue/_utils/interfaceless.py 25 25 0% 1-73 +fugue/_utils/io.py 176 176 0% 1-290 +fugue/_utils/misc.py 13 13 0% 1-26 +fugue/_utils/registry.py 6 6 0% 1-10 +fugue/api.py 5 5 0% 3-71 +fugue/bag/__init__.py 1 1 0% 2 +fugue/bag/array_bag.py 28 28 0% 1-39 +fugue/bag/bag.py 46 46 0% 1-107 +fugue/collections/partition.py 197 197 0% 1-469 +fugue/collections/sql.py 53 53 0% 1-151 +fugue/collections/yielded.py 26 26 0% 1-74 +fugue/column/__init__.py 3 3 0% 2-4 +fugue/column/expressions.py 282 282 0% 1-856 +fugue/column/functions.py 48 48 0% 1-370 +fugue/column/sql.py 252 252 0% 1-497 +fugue/constants.py 28 28 0% 1-92 +fugue/dataframe/__init__.py 10 10 0% 2-21 +fugue/dataframe/api.py 68 68 0% 1-340 +fugue/dataframe/array_dataframe.py 75 75 0% 1-128 +fugue/dataframe/arrow_dataframe.py 183 183 0% 1-342 +fugue/dataframe/dataframe.py 178 178 0% 1-505 +fugue/dataframe/dataframe_iterable_dataframe.py 106 106 0% 1-208 +fugue/dataframe/dataframes.py 49 49 0% 1-109 +fugue/dataframe/function_wrapper.py 274 274 0% 1-460 +fugue/dataframe/iterable_dataframe.py 80 80 0% 1-138 +fugue/dataframe/pandas_dataframe.py 186 186 0% 1-355 +fugue/dataframe/utils.py 113 113 0% 1-291 +fugue/dataset/__init__.py 2 2 0% 2-3 +fugue/dataset/api.py 31 31 0% 1-100 +fugue/dataset/dataset.py 36 36 0% 1-148 +fugue/dev.py 14 14 0% 7-47 +fugue/exceptions.py 17 17 0% 1-65 +fugue/execution/__init__.py 4 4 0% 2-13 +fugue/execution/api.py 128 128 0% 1-1219 +fugue/execution/execution_engine.py 296 296 0% 1-1360 +fugue/execution/factory.py 83 83 0% 1-577 +fugue/execution/native_execution_engine.py 175 175 0% 1-422 +fugue/extensions/__init__.py 5 5 0% 2-6 +fugue/extensions/_builtins/__init__.py 3 3 0% 2-10 +fugue/extensions/_builtins/creators.py 34 34 0% 1-54 +fugue/extensions/_builtins/outputters.py 111 111 0% 1-176 +fugue/extensions/_builtins/processors.py 228 228 0% 1-375 +fugue/extensions/_utils.py 71 71 0% 1-143 +fugue/extensions/context.py 54 54 0% 1-121 +fugue/extensions/creator/__init__.py 2 2 0% 2-8 +fugue/extensions/creator/convert.py 71 71 0% 1-228 +fugue/extensions/creator/creator.py 4 4 0% 1-7 +fugue/extensions/outputter/__init__.py 2 2 0% 2-8 +fugue/extensions/outputter/convert.py 73 73 0% 1-217 +fugue/extensions/outputter/outputter.py 4 4 0% 1-7 +fugue/extensions/processor/__init__.py 2 2 0% 2-8 +fugue/extensions/processor/convert.py 83 83 0% 1-253 +fugue/extensions/processor/processor.py 4 4 0% 1-7 +fugue/extensions/transformer/__init__.py 2 2 0% 2-14 +fugue/extensions/transformer/constants.py 1 1 0% 1 +fugue/extensions/transformer/convert.py 254 254 0% 1-688 +fugue/extensions/transformer/transformer.py 20 20 0% 1-210 +fugue/plugins.py 9 9 0% 3-43 +fugue/registry.py 9 9 0% 1-30 +fugue/rpc/__init__.py 1 1 0% 2 +fugue/rpc/base.py 121 121 0% 1-280 +fugue/rpc/flask.py 62 62 0% 1-131 +fugue/sql/_utils.py 37 37 0% 1-67 +fugue/sql/_visitors.py 534 534 0% 3-913 +fugue/sql/api.py 29 29 0% 1-287 +fugue/sql/workflow.py 47 47 0% 1-82 +fugue/test/__init__.py 2 2 0% 2-3 +fugue/test/pandas_tester.py 15 15 0% 1-23 +fugue/test/plugins.py 131 131 0% 1-393 +fugue/workflow/__init__.py 5 5 0% 3-7 +fugue/workflow/_checkpoint.py 87 87 0% 1-173 +fugue/workflow/_tasks.py 179 179 0% 1-347 +fugue/workflow/_workflow_context.py 47 47 0% 1-78 +fugue/workflow/api.py 72 72 0% 1-307 +fugue/workflow/input.py 1 1 0% 1 +fugue/workflow/module.py 93 93 0% 1-165 +fugue/workflow/workflow.py 421 421 0% 1-2302 +fugue_dask/_io.py 109 86 21% 24-33, 44-57, 61-66, 72-92, 96-97, 103-106, 114-144, 148-149, 153-156, 164-170 +fugue_dask/_utils.py 118 74 37% 39-41, 53-59, 78-90, 111-121, 125-130, 136-143, 149-169, 175-188, 192-216, 221, 226-227 +fugue_dask/dataframe.py 222 105 53% 66-67, 69-71, 74, 78-84, 92, 103, 107, 112, 117, 121, 125, 128-129, 132-133, 136-137, 140-142, 145, 154-155, 166-171, 176, 182-186, 191-193, 198, 203, 208-210, 240, 245, 250, 255, 260, 265, 270, 275, 280-283, 290-295, 302-305, 315-318, 325-329, 336-338, 345-350, 357-360, 364-366, 370, 379-392 +fugue_dask/execution_engine.py 251 149 41% 61, 91, 95, 106-184, 214, 218, 233, 242, 269-306, 309, 317-319, 328-345, 353-360, 368-378, 386-396, 399-400, 409-417, 420-437, 447-455, 465-526, 535, 549-565, 596, 601 +fugue_dask/registry.py 34 8 76% 29, 34, 50, 61-62, 65-67 +fugue_duckdb/_io.py 144 124 14% 22-26, 31-33, 42-57, 67-85, 88-98, 103-172, 175-187, 192-207 +fugue_duckdb/_utils.py 97 80 18% 38, 42-43, 47, 51-79, 83-96, 102-131, 140-158 +fugue_duckdb/dask.py 66 40 39% 33-34, 37, 40, 44, 47-58, 61-64, 67-69, 77-79, 91-102, 113-115, 120-128, 131-135 +fugue_duckdb/dataframe.py 130 66 49% 45-46, 50, 55, 58, 62, 65-68, 71, 74, 77, 80-88, 91-104, 107, 110, 113-116, 121, 126, 131, 136, 141-143, 148, 153, 158, 163, 168-170, 175-176, 181, 186, 193-196, 203-212, 219, 228, 235, 244, 248-250 +fugue_duckdb/execution_engine.py 272 212 22% 44, 47-51, 54, 64-77, 84-85, 93, 96-104, 107-113, 116-130, 148-166, 169-176, 179-180, 183, 187, 191, 195, 198, 201, 204, 207, 216, 225-231, 240-307, 310, 313-320, 338-342, 348-349, 358-372, 375-399, 409-423, 433-477, 486-487, 499-503, 506-510, 513-517, 523-554 +fugue_duckdb/registry.py 51 9 82% 34, 90, 106-107, 113-114, 117-119 +fugue_duckdb/tester.py 35 12 66% 24-25, 36-38, 43-49 +fugue_ibis/_utils.py 109 81 26% 61-62, 69-77, 81-98, 102-117, 125-126, 132-134, 137-138, 143-145, 148, 158-166, 184-211 +fugue_ibis/dataframe.py 143 87 39% 24-29, 39, 42, 45, 61, 65, 69, 73, 81, 84-87, 90, 93-94, 97-98, 101-106, 109-112, 118, 121, 124-127, 132-134, 139-142, 147-149, 154-157, 162-164, 167-181, 186, 191, 196-201, 206-209, 213-221, 225-227 +fugue_ibis/execution_engine.py 243 175 28% 42, 50-58, 65, 91-123, 126-129, 134-137, 142-145, 148-150, 159-177, 180-207, 217-269, 272, 282-287, 292, 298, 310, 313-314, 318, 329-330, 359-360, 368, 372, 376, 379, 383, 386, 389-391, 394-397, 400-402, 410-412, 421-423, 426-428, 433-435, 440-442, 445-447, 456-460, 463-465, 475-483, 499-507, 522 +fugue_polars/_utils.py 4 1 75% 6 +fugue_polars/polars_dataframe.py 147 74 50% 63-74, 79, 82, 86, 89-90, 93-94, 97, 100, 105-110, 113, 116, 119, 122-123, 126, 131, 136, 139, 144, 149, 154, 159-161, 166, 171, 176, 181, 186, 191, 196, 201-203, 208-214, 219-224, 229-232, 239-241, 248-249, 256-257, 264-265, 269-271 +fugue_polars/registry.py 45 21 53% 21, 27, 30-31, 37, 47-48, 57-70, 77, 81-86 +fugue_ray/_utils/cluster.py 9 4 56% 8-11, 15-16 +fugue_ray/_utils/dataframe.py 37 23 38% 15-16, 21-26, 30-38, 52, 58-99, 113-120 +fugue_ray/_utils/io.py 119 88 26% 27-34, 47-57, 69-84, 92, 100-107, 117, 123-128, 135-141, 146-203, 208-247, 250 +fugue_ray/dataframe.py 208 140 33% 55-107, 114, 117, 121, 124-131, 135, 139, 143, 146-147, 150-152, 159-162, 166-167, 170, 173, 176, 179-190, 193-205, 210, 215, 218, 223, 228-231, 236-248, 256, 261, 266, 271, 276-277, 289-296, 303, 308, 315-318, 325, 332-335, 342, 346-356 +fugue_ray/execution_engine.py 136 93 32% 35, 39, 50-59, 75-144, 154-201, 218-219, 222, 226, 229, 232-236, 239, 242-262, 265, 273-275, 279-281, 284-291, 300, 314-316, 328-331, 335-341, 344-351 +fugue_ray/registry.py 32 7 78% 24, 29, 46-47, 50-52 +fugue_ray/tester.py 14 2 86% 21-22 +fugue_spark/_utils/convert.py 133 99 26% 38-44, 50-55, 59-64, 70-106, 110-116, 120-135, 144-163, 169-170, 177-181, 194-211, 220-237, 241-246, 250-268, 272 +fugue_spark/_utils/io.py 96 77 20% 19-20, 33-43, 55-65, 71-79, 84-90, 95-129, 132-139 +fugue_spark/_utils/misc.py 11 2 82% 13, 25 +fugue_spark/_utils/partition.py 51 37 27% 17-23, 29-35, 41-78, 82, 100-117 +fugue_spark/dataframe.py 199 114 43% 67-77, 85, 93, 96, 100, 104, 107-110, 114, 118, 121-122, 125-128, 131-132, 135-136, 139, 142, 145-149, 152-155, 160, 165, 170, 175, 180-181, 187-192, 195-197, 202, 207, 212, 217, 222, 227, 232, 237, 242, 247, 254-257, 264-269, 276-279, 289-292, 299-301, 311-324, 331-333, 340-344, 348-354, 358-360, 364 +fugue_spark/execution_engine.py 326 243 25% 82, 86, 90, 93-98, 106, 111, 122-135, 146-190, 201-235, 246-334, 349-369, 372, 380-383, 387, 391, 395, 398, 401, 404-414, 442, 445-476, 479-481, 489-493, 502-516, 524-533, 538-548, 553-563, 566-567, 576-577, 580-597, 607-632, 642-698, 707, 721-723, 734-735, 738-747, 753-817 +fugue_spark/registry.py 74 25 66% 42-47, 52, 57-60, 90-91, 97-98, 104-105, 108-110, 119-120, 123-125, 131 +fugue_spark/tester.py 27 7 74% 32, 37-38, 75-78 +fugue_test/bag_suite.py 62 62 0% 4-84 +fugue_test/builtin_suite.py 1174 1174 0% 2-2018 +fugue_test/dataframe_suite.py 278 278 0% 3-462 +fugue_test/execution_suite.py 571 571 0% 2-1299 +------------------------------------------------------------------------------- +TOTAL 12184 10800 11% + +16 files skipped due to complete coverage. + +=========================== short test summary info ============================ +FAILED tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] - pandas.errors.IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match). +======================== 1 failed, 4 warnings in 9.53s ========================= From d20bfa017ea348ccd93603a6915cf2896f0d1472 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 13 Jun 2024 06:19:46 +0000 Subject: [PATCH 5/8] update --- tests/fugue_dask/test_execution_engine.py | 36 +++++++++++++++++++++++ tests/fugue_ibis/mock/execution_engine.py | 16 +++++----- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index fd20cdee..2a93772d 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -24,6 +24,8 @@ from fugue_dask.execution_engine import DaskExecutionEngine from fugue_test.builtin_suite import BuiltInTests from fugue_test.execution_suite import ExecutionEngineTests +from fugue.column import col, all_cols +import fugue.column.functions as ff _CONF = { "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer", @@ -50,6 +52,40 @@ def test_get_parallelism(self): def test__join_outer_pandas_incompatible(self): return + # TODO: dask-sql 2024.5.0 has a bug, can't pass the HAVING tests + def test_select(self): + a = ArrayDataFrame( + [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" + ) + + # simple + b = fa.select(a, col("b"), (col("b") + 1).alias("c").cast(str)) + self.df_eq( + b, + [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]], + "b:int,c:str", + throw=True, + ) + + # with distinct + b = fa.select( + a, col("b"), (col("b") + 1).alias("c").cast(str), distinct=True + ) + self.df_eq( + b, + [[2, "3"], [1, "2"], [4, "5"]], + "b:int,c:str", + throw=True, + ) + + # wildcard + b = fa.select(a, all_cols(), where=col("a") + col("b") == 3) + self.df_eq(b, [[1, 2]], "a:double,b:int", throw=True) + + # aggregation + b = fa.select(a, col("a"), ff.sum(col("b")).cast(float).alias("b")) + self.df_eq(b, [[1, 2], [3, 4], [None, 7]], "a:double,b:double", throw=True) + def test_to_df(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") diff --git a/tests/fugue_ibis/mock/execution_engine.py b/tests/fugue_ibis/mock/execution_engine.py index fc0ffa05..5a5770fb 100644 --- a/tests/fugue_ibis/mock/execution_engine.py +++ b/tests/fugue_ibis/mock/execution_engine.py @@ -81,15 +81,17 @@ def sample( f"one and only one of n and frac should be non-negative, {n}, {frac}" ), ) - tn = self.get_temp_table_name() + idf = self.to_df(df) + tn = f"({idf.native.compile()})" + if seed is not None: + _seed = f",{seed}" + else: + _seed = "" if frac is not None: - sql = f"SELECT * FROM {tn} USING SAMPLE bernoulli({frac*100} PERCENT)" + sql = f"SELECT * FROM {tn} USING SAMPLE {frac*100}% (bernoulli{_seed})" else: - sql = f"SELECT * FROM {tn} USING SAMPLE reservoir({n} ROWS)" - if seed is not None: - sql += f" REPEATABLE ({seed})" - idf = self.to_df(df) - _res = f"WITH {tn} AS ({idf.native.compile()}) " + sql + sql = f"SELECT * FROM {tn} USING SAMPLE {n} ROWS (reservoir{_seed})" + _res = f"SELECT * FROM ({sql})" # ibis has a bug to inject LIMIT return self.to_df(self.backend.sql(_res)) def _register_df( From f3abb68cecf14237981831264abc34a65e7d4a57 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 13 Jun 2024 06:26:45 +0000 Subject: [PATCH 6/8] update --- tests/fugue_dask/test_execution_engine.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index 2a93772d..60f1121f 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -54,6 +54,11 @@ def test__join_outer_pandas_incompatible(self): # TODO: dask-sql 2024.5.0 has a bug, can't pass the HAVING tests def test_select(self): + try: + import dask_sql + except ImportError: + return + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" ) From 1100a26d49329d609778eda5333045cc5fdf3126 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 13 Jun 2024 06:37:57 +0000 Subject: [PATCH 7/8] update --- x.txt | 2868 --------------------------------------------------------- 1 file changed, 2868 deletions(-) delete mode 100644 x.txt diff --git a/x.txt b/x.txt deleted file mode 100644 index 665d9bf3..00000000 --- a/x.txt +++ /dev/null @@ -1,2868 +0,0 @@ -============================= test session starts ============================== -platform linux -- Python 3.10.14, pytest-8.2.1, pluggy-1.5.0 -- /usr/local/py-utils/venvs/pytest/bin/python -cachedir: .pytest_cache -Fugue tests will be initialized with options: - fugue.test.dummy = dummy - fugue.test = True - ray.num_cpus = 2 - dask.processes = True - dask.n_workers = 3 - dask.threads_per_worker = 1 -rootdir: /workspaces/fugue -configfile: setup.cfg -plugins: cov-5.0.0, rerunfailures-10.2, fugue-0.9.1, mock-3.14.0, anyio-4.4.0 -collecting ... collected 1 item - -tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] <- ../../home/vscode/.local/lib/python3.10/site-packages/fugue_test/execution_suite.py FAILED [100%] - -=================================== FAILURES =================================== -__________________ DaskExecutionEngineTests.test_select[dask] __________________ - -self = - - @pytest.mark.skipif(not HAS_QPD, reason="qpd not working") - def test_select(self): - a = ArrayDataFrame( - [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" - ) - - # simple - b = fa.select(a, col("b"), (col("b") + 1).alias("c").cast(str)) - self.df_eq( - b, - [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]], - "b:int,c:str", - throw=True, - ) - - # with distinct - b = fa.select( - a, col("b"), (col("b") + 1).alias("c").cast(str), distinct=True - ) - self.df_eq( - b, - [[2, "3"], [1, "2"], [4, "5"]], - "b:int,c:str", - throw=True, - ) - - # wildcard - b = fa.select(a, all_cols(), where=col("a") + col("b") == 3) - self.df_eq(b, [[1, 2]], "a:double,b:int", throw=True) - - # aggregation - b = fa.select(a, col("a"), ff.sum(col("b")).cast(float).alias("b")) - self.df_eq(b, [[1, 2], [3, 4], [None, 7]], "a:double,b:double", throw=True) - - # having - # https://github.com/fugue-project/fugue/issues/222 - col_b = ff.sum(col("b")) - b = fa.select( - a, - col("a"), - col_b.cast(float).alias("c"), - having=(col_b >= 7) | (col("a") == 1), - ) -> self.df_eq(b, [[1, 2], [None, 7]], "a:double,c:double", throw=True) - -/home/vscode/.local/lib/python3.10/site-packages/fugue_test/execution_suite.py:139: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ -/home/vscode/.local/lib/python3.10/site-packages/fugue/test/plugins.py:191: in df_eq - return _df_eq(*args, **kwargs) -/home/vscode/.local/lib/python3.10/site-packages/fugue/dataframe/utils.py:58: in _df_eq - df1 = as_fugue_df(df).as_local_bounded() -fugue_dask/dataframe.py:110: in as_local_bounded - res = PandasDataFrame(self.as_pandas(), self.schema) -fugue_dask/dataframe.py:148: in as_pandas - pdf = self.native.compute().reset_index(drop=True) -/home/vscode/.local/lib/python3.10/site-packages/dask_expr/_collection.py:476: in compute - return DaskMethodsMixin.compute(out, **kwargs) -/home/vscode/.local/lib/python3.10/site-packages/dask/base.py:375: in compute - (result,) = compute(self, traverse=False, **kwargs) -/home/vscode/.local/lib/python3.10/site-packages/dask/base.py:661: in compute - results = schedule(dsk, keys, **kwargs) -/home/vscode/.local/lib/python3.10/site-packages/dask_expr/_expr.py:3727: in _execute_task - return dask.core.get(graph, name) -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - - from __future__ import annotations - - from contextlib import suppress - import sys - from typing import ( - TYPE_CHECKING, - cast, - final, - ) - import warnings - - import numpy as np - - from pandas._config import using_copy_on_write - - from pandas._libs.indexing import NDFrameIndexerBase - from pandas._libs.lib import item_from_zerodim - from pandas.compat import PYPY - from pandas.errors import ( - AbstractMethodError, - ChainedAssignmentError, - IndexingError, - InvalidIndexError, - LossySetitemError, - _chained_assignment_msg, - ) - from pandas.util._decorators import doc - - from pandas.core.dtypes.cast import ( - can_hold_element, - maybe_promote, - ) - from pandas.core.dtypes.common import ( - is_array_like, - is_bool_dtype, - is_hashable, - is_integer, - is_iterator, - is_list_like, - is_numeric_dtype, - is_object_dtype, - is_scalar, - is_sequence, - ) - from pandas.core.dtypes.concat import concat_compat - from pandas.core.dtypes.dtypes import ExtensionDtype - from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCSeries, - ) - from pandas.core.dtypes.missing import ( - infer_fill_value, - is_valid_na_for_dtype, - isna, - na_value_for_dtype, - ) - - from pandas.core import algorithms as algos - import pandas.core.common as com - from pandas.core.construction import ( - array as pd_array, - extract_array, - ) - from pandas.core.indexers import ( - check_array_indexer, - is_list_like_indexer, - is_scalar_indexer, - length_of_indexer, - ) - from pandas.core.indexes.api import ( - Index, - MultiIndex, - ) - - if TYPE_CHECKING: - from collections.abc import ( - Hashable, - Sequence, - ) - - from pandas._typing import ( - Axis, - AxisInt, - Self, - ) - - from pandas import ( - DataFrame, - Series, - ) - - # "null slice" - _NS = slice(None, None) - _one_ellipsis_message = "indexer may only contain one '...' entry" - - - # the public IndexSlicerMaker - class _IndexSlice: - """ - Create an object to more easily perform multi-index slicing. - - See Also - -------- - MultiIndex.remove_unused_levels : New MultiIndex with no unused levels. - - Notes - ----- - See :ref:`Defined Levels ` - for further info on slicing a MultiIndex. - - Examples - -------- - >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) - >>> columns = ['foo', 'bar'] - >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), - ... index=midx, columns=columns) - - Using the default slice command: - - >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :] - foo bar - A0 B0 0 1 - B1 2 3 - A1 B0 8 9 - B1 10 11 - - Using the IndexSlice class for a more intuitive command: - - >>> idx = pd.IndexSlice - >>> dfmi.loc[idx[:, 'B0':'B1'], :] - foo bar - A0 B0 0 1 - B1 2 3 - A1 B0 8 9 - B1 10 11 - """ - - def __getitem__(self, arg): - return arg - - - IndexSlice = _IndexSlice() - - - class IndexingMixin: - """ - Mixin for adding .loc/.iloc/.at/.iat to Dataframes and Series. - """ - - @property - def iloc(self) -> _iLocIndexer: - """ - Purely integer-location based indexing for selection by position. - - ``.iloc[]`` is primarily integer position based (from ``0`` to - ``length-1`` of the axis), but may also be used with a boolean - array. - - Allowed inputs are: - - - An integer, e.g. ``5``. - - A list or array of integers, e.g. ``[4, 3, 0]``. - - A slice object with ints, e.g. ``1:7``. - - A boolean array. - - A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above). - This is useful in method chains, when you don't have a reference to the - calling object, but would like to base your selection on some value. - - A tuple of row and column indexes. The tuple elements consist of one of the - above inputs, e.g. ``(0, 1)``. - - ``.iloc`` will raise ``IndexError`` if a requested indexer is - out-of-bounds, except *slice* indexers which allow out-of-bounds - indexing (this conforms with python/numpy *slice* semantics). - - See more at :ref:`Selection by Position `. - - See Also - -------- - DataFrame.iat : Fast integer location scalar accessor. - DataFrame.loc : Purely label-location based indexer for selection by label. - Series.iloc : Purely integer-location based indexing for - selection by position. - - Examples - -------- - >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, - ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] - >>> df = pd.DataFrame(mydict) - >>> df - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - 2 1000 2000 3000 4000 - - **Indexing just the rows** - - With a scalar integer. - - >>> type(df.iloc[0]) - - >>> df.iloc[0] - a 1 - b 2 - c 3 - d 4 - Name: 0, dtype: int64 - - With a list of integers. - - >>> df.iloc[[0]] - a b c d - 0 1 2 3 4 - >>> type(df.iloc[[0]]) - - - >>> df.iloc[[0, 1]] - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - - With a `slice` object. - - >>> df.iloc[:3] - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - 2 1000 2000 3000 4000 - - With a boolean mask the same length as the index. - - >>> df.iloc[[True, False, True]] - a b c d - 0 1 2 3 4 - 2 1000 2000 3000 4000 - - With a callable, useful in method chains. The `x` passed - to the ``lambda`` is the DataFrame being sliced. This selects - the rows whose index label even. - - >>> df.iloc[lambda x: x.index % 2 == 0] - a b c d - 0 1 2 3 4 - 2 1000 2000 3000 4000 - - **Indexing both axes** - - You can mix the indexer types for the index and columns. Use ``:`` to - select the entire axis. - - With scalar integers. - - >>> df.iloc[0, 1] - 2 - - With lists of integers. - - >>> df.iloc[[0, 2], [1, 3]] - b d - 0 2 4 - 2 2000 4000 - - With `slice` objects. - - >>> df.iloc[1:3, 0:3] - a b c - 1 100 200 300 - 2 1000 2000 3000 - - With a boolean array whose length matches the columns. - - >>> df.iloc[:, [True, False, True, False]] - a c - 0 1 3 - 1 100 300 - 2 1000 3000 - - With a callable function that expects the Series or DataFrame. - - >>> df.iloc[:, lambda df: [0, 2]] - a c - 0 1 3 - 1 100 300 - 2 1000 3000 - """ - return _iLocIndexer("iloc", self) - - @property - def loc(self) -> _LocIndexer: - """ - Access a group of rows and columns by label(s) or a boolean array. - - ``.loc[]`` is primarily label based, but may also be used with a - boolean array. - - Allowed inputs are: - - - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is - interpreted as a *label* of the index, and **never** as an - integer position along the index). - - A list or array of labels, e.g. ``['a', 'b', 'c']``. - - A slice object with labels, e.g. ``'a':'f'``. - - .. warning:: Note that contrary to usual python slices, **both** the - start and the stop are included - - - A boolean array of the same length as the axis being sliced, - e.g. ``[True, False, True]``. - - An alignable boolean Series. The index of the key will be aligned before - masking. - - An alignable Index. The Index of the returned selection will be the input. - - A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above) - - See more at :ref:`Selection by Label `. - - Raises - ------ - KeyError - If any items are not found. - IndexingError - If an indexed key is passed and its index is unalignable to the frame index. - - See Also - -------- - DataFrame.at : Access a single value for a row/column label pair. - DataFrame.iloc : Access group of rows and columns by integer position(s). - DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. - Series.loc : Access group of values using labels. - - Examples - -------- - **Getting values** - - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) - >>> df - max_speed shield - cobra 1 2 - viper 4 5 - sidewinder 7 8 - - Single label. Note this returns the row as a Series. - - >>> df.loc['viper'] - max_speed 4 - shield 5 - Name: viper, dtype: int64 - - List of labels. Note using ``[[]]`` returns a DataFrame. - - >>> df.loc[['viper', 'sidewinder']] - max_speed shield - viper 4 5 - sidewinder 7 8 - - Single label for row and column - - >>> df.loc['cobra', 'shield'] - 2 - - Slice with labels for row and single label for column. As mentioned - above, note that both the start and stop of the slice are included. - - >>> df.loc['cobra':'viper', 'max_speed'] - cobra 1 - viper 4 - Name: max_speed, dtype: int64 - - Boolean list with the same length as the row axis - - >>> df.loc[[False, False, True]] - max_speed shield - sidewinder 7 8 - - Alignable boolean Series: - - >>> df.loc[pd.Series([False, True, False], - ... index=['viper', 'sidewinder', 'cobra'])] - max_speed shield - sidewinder 7 8 - - Index (same behavior as ``df.reindex``) - - >>> df.loc[pd.Index(["cobra", "viper"], name="foo")] - max_speed shield - foo - cobra 1 2 - viper 4 5 - - Conditional that returns a boolean Series - - >>> df.loc[df['shield'] > 6] - max_speed shield - sidewinder 7 8 - - Conditional that returns a boolean Series with column labels specified - - >>> df.loc[df['shield'] > 6, ['max_speed']] - max_speed - sidewinder 7 - - Multiple conditional using ``&`` that returns a boolean Series - - >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] - max_speed shield - viper 4 5 - - Multiple conditional using ``|`` that returns a boolean Series - - >>> df.loc[(df['max_speed'] > 4) | (df['shield'] < 5)] - max_speed shield - cobra 1 2 - sidewinder 7 8 - - Please ensure that each condition is wrapped in parentheses ``()``. - See the :ref:`user guide` - for more details and explanations of Boolean indexing. - - .. note:: - If you find yourself using 3 or more conditionals in ``.loc[]``, - consider using :ref:`advanced indexing`. - - See below for using ``.loc[]`` on MultiIndex DataFrames. - - Callable that returns a boolean Series - - >>> df.loc[lambda df: df['shield'] == 8] - max_speed shield - sidewinder 7 8 - - **Setting values** - - Set value for all items matching the list of labels - - >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 - >>> df - max_speed shield - cobra 1 2 - viper 4 50 - sidewinder 7 50 - - Set value for an entire row - - >>> df.loc['cobra'] = 10 - >>> df - max_speed shield - cobra 10 10 - viper 4 50 - sidewinder 7 50 - - Set value for an entire column - - >>> df.loc[:, 'max_speed'] = 30 - >>> df - max_speed shield - cobra 30 10 - viper 30 50 - sidewinder 30 50 - - Set value for rows matching callable condition - - >>> df.loc[df['shield'] > 35] = 0 - >>> df - max_speed shield - cobra 30 10 - viper 0 0 - sidewinder 0 0 - - Add value matching location - - >>> df.loc["viper", "shield"] += 5 - >>> df - max_speed shield - cobra 30 10 - viper 0 5 - sidewinder 0 0 - - Setting using a ``Series`` or a ``DataFrame`` sets the values matching the - index labels, not the index positions. - - >>> shuffled_df = df.loc[["viper", "cobra", "sidewinder"]] - >>> df.loc[:] += shuffled_df - >>> df - max_speed shield - cobra 60 20 - viper 0 10 - sidewinder 0 0 - - **Getting values on a DataFrame with an index that has integer labels** - - Another example using integers for the index - - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) - >>> df - max_speed shield - 7 1 2 - 8 4 5 - 9 7 8 - - Slice with integer labels for rows. As mentioned above, note that both - the start and stop of the slice are included. - - >>> df.loc[7:9] - max_speed shield - 7 1 2 - 8 4 5 - 9 7 8 - - **Getting values with a MultiIndex** - - A number of examples using a DataFrame with a MultiIndex - - >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') - ... ] - >>> index = pd.MultiIndex.from_tuples(tuples) - >>> values = [[12, 2], [0, 4], [10, 20], - ... [1, 4], [7, 1], [16, 36]] - >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) - >>> df - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - mark iii 16 36 - - Single label. Note this returns a DataFrame with a single index. - - >>> df.loc['cobra'] - max_speed shield - mark i 12 2 - mark ii 0 4 - - Single index tuple. Note this returns a Series. - - >>> df.loc[('cobra', 'mark ii')] - max_speed 0 - shield 4 - Name: (cobra, mark ii), dtype: int64 - - Single label for row and column. Similar to passing in a tuple, this - returns a Series. - - >>> df.loc['cobra', 'mark i'] - max_speed 12 - shield 2 - Name: (cobra, mark i), dtype: int64 - - Single tuple. Note using ``[[]]`` returns a DataFrame. - - >>> df.loc[[('cobra', 'mark ii')]] - max_speed shield - cobra mark ii 0 4 - - Single tuple for the index with a single label for the column - - >>> df.loc[('cobra', 'mark i'), 'shield'] - 2 - - Slice from index tuple to single label - - >>> df.loc[('cobra', 'mark i'):'viper'] - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - mark iii 16 36 - - Slice from index tuple to index tuple - - >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - - Please see the :ref:`user guide` - for more details and explanations of advanced indexing. - """ - return _LocIndexer("loc", self) - - @property - def at(self) -> _AtIndexer: - """ - Access a single value for a row/column label pair. - - Similar to ``loc``, in that both provide label-based lookups. Use - ``at`` if you only need to get or set a single value in a DataFrame - or Series. - - Raises - ------ - KeyError - * If getting a value and 'label' does not exist in a DataFrame or - Series. - ValueError - * If row/column label pair is not a tuple or if any label from - the pair is not a scalar for DataFrame. - * If label is list-like (*excluding* NamedTuple) for Series. - - See Also - -------- - DataFrame.at : Access a single value for a row/column pair by label. - DataFrame.iat : Access a single value for a row/column pair by integer - position. - DataFrame.loc : Access a group of rows and columns by label(s). - DataFrame.iloc : Access a group of rows and columns by integer - position(s). - Series.at : Access a single value by label. - Series.iat : Access a single value by integer position. - Series.loc : Access a group of rows by label(s). - Series.iloc : Access a group of rows by integer position(s). - - Notes - ----- - See :ref:`Fast scalar value getting and setting ` - for more details. - - Examples - -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... index=[4, 5, 6], columns=['A', 'B', 'C']) - >>> df - A B C - 4 0 2 3 - 5 0 4 1 - 6 10 20 30 - - Get value at specified row/column pair - - >>> df.at[4, 'B'] - 2 - - Set value at specified row/column pair - - >>> df.at[4, 'B'] = 10 - >>> df.at[4, 'B'] - 10 - - Get value within a Series - - >>> df.loc[5].at['B'] - 4 - """ - return _AtIndexer("at", self) - - @property - def iat(self) -> _iAtIndexer: - """ - Access a single value for a row/column pair by integer position. - - Similar to ``iloc``, in that both provide integer-based lookups. Use - ``iat`` if you only need to get or set a single value in a DataFrame - or Series. - - Raises - ------ - IndexError - When integer position is out of bounds. - - See Also - -------- - DataFrame.at : Access a single value for a row/column label pair. - DataFrame.loc : Access a group of rows and columns by label(s). - DataFrame.iloc : Access a group of rows and columns by integer position(s). - - Examples - -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... columns=['A', 'B', 'C']) - >>> df - A B C - 0 0 2 3 - 1 0 4 1 - 2 10 20 30 - - Get value at specified row/column pair - - >>> df.iat[1, 2] - 1 - - Set value at specified row/column pair - - >>> df.iat[1, 2] = 10 - >>> df.iat[1, 2] - 10 - - Get value within a series - - >>> df.loc[0].iat[1] - 2 - """ - return _iAtIndexer("iat", self) - - - class _LocationIndexer(NDFrameIndexerBase): - _valid_types: str - axis: AxisInt | None = None - - # sub-classes need to set _takeable - _takeable: bool - - @final - def __call__(self, axis: Axis | None = None) -> Self: - # we need to return a copy of ourselves - new_self = type(self)(self.name, self.obj) - - if axis is not None: - axis_int_none = self.obj._get_axis_number(axis) - else: - axis_int_none = axis - new_self.axis = axis_int_none - return new_self - - def _get_setitem_indexer(self, key): - """ - Convert a potentially-label-based key into a positional indexer. - """ - if self.name == "loc": - # always holds here bc iloc overrides _get_setitem_indexer - self._ensure_listlike_indexer(key) - - if isinstance(key, tuple): - for x in key: - check_dict_or_set_indexers(x) - - if self.axis is not None: - key = _tupleize_axis_indexer(self.ndim, self.axis, key) - - ax = self.obj._get_axis(0) - - if ( - isinstance(ax, MultiIndex) - and self.name != "iloc" - and is_hashable(key) - and not isinstance(key, slice) - ): - with suppress(KeyError, InvalidIndexError): - # TypeError e.g. passed a bool - return ax.get_loc(key) - - if isinstance(key, tuple): - with suppress(IndexingError): - # suppress "Too many indexers" - return self._convert_tuple(key) - - if isinstance(key, range): - # GH#45479 test_loc_setitem_range_key - key = list(key) - - return self._convert_to_indexer(key, axis=0) - - @final - def _maybe_mask_setitem_value(self, indexer, value): - """ - If we have obj.iloc[mask] = series_or_frame and series_or_frame has the - same length as obj, we treat this as obj.iloc[mask] = series_or_frame[mask], - similar to Series.__setitem__. - - Note this is only for loc, not iloc. - """ - - if ( - isinstance(indexer, tuple) - and len(indexer) == 2 - and isinstance(value, (ABCSeries, ABCDataFrame)) - ): - pi, icols = indexer - ndim = value.ndim - if com.is_bool_indexer(pi) and len(value) == len(pi): - newkey = pi.nonzero()[0] - - if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1: - # e.g. test_loc_setitem_boolean_mask_allfalse - # test_loc_setitem_ndframe_values_alignment - value = self.obj.iloc._align_series(indexer, value) - indexer = (newkey, icols) - - elif ( - isinstance(icols, np.ndarray) - and icols.dtype.kind == "i" - and len(icols) == 1 - ): - if ndim == 1: - # We implicitly broadcast, though numpy does not, see - # github.com/pandas-dev/pandas/pull/45501#discussion_r789071825 - # test_loc_setitem_ndframe_values_alignment - value = self.obj.iloc._align_series(indexer, value) - indexer = (newkey, icols) - - elif ndim == 2 and value.shape[1] == 1: - # test_loc_setitem_ndframe_values_alignment - value = self.obj.iloc._align_frame(indexer, value) - indexer = (newkey, icols) - elif com.is_bool_indexer(indexer): - indexer = indexer.nonzero()[0] - - return indexer, value - - @final - def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: - """ - Ensure that a list-like of column labels are all present by adding them if - they do not already exist. - - Parameters - ---------- - key : list-like of column labels - Target labels. - axis : key axis if known - """ - column_axis = 1 - - # column only exists in 2-dimensional DataFrame - if self.ndim != 2: - return - - orig_key = key - if isinstance(key, tuple) and len(key) > 1: - # key may be a tuple if we are .loc - # if length of key is > 1 set key to column part - key = key[column_axis] - axis = column_axis - - if ( - axis == column_axis - and not isinstance(self.obj.columns, MultiIndex) - and is_list_like_indexer(key) - and not com.is_bool_indexer(key) - and all(is_hashable(k) for k in key) - ): - # GH#38148 - keys = self.obj.columns.union(key, sort=False) - diff = Index(key).difference(self.obj.columns, sort=False) - - if len(diff) and com.is_null_slice(orig_key[0]): - # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B" - # is a new column, add the new columns with dtype=np.void - # so that later when we go through setitem_single_column - # we will use isetitem. Without this, the reindex_axis - # below would create float64 columns in this example, which - # would successfully hold 7, so we would end up with the wrong - # dtype. - indexer = np.arange(len(keys), dtype=np.intp) - indexer[len(self.obj.columns) :] = -1 - new_mgr = self.obj._mgr.reindex_indexer( - keys, indexer=indexer, axis=0, only_slice=True, use_na_proxy=True - ) - self.obj._mgr = new_mgr - return - - self.obj._mgr = self.obj._mgr.reindex_axis(keys, axis=0, only_slice=True) - - @final - def __setitem__(self, key, value) -> None: - if not PYPY and using_copy_on_write(): - if sys.getrefcount(self.obj) <= 2: - warnings.warn( - _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 - ) - - check_dict_or_set_indexers(key) - if isinstance(key, tuple): - key = tuple(list(x) if is_iterator(x) else x for x in key) - key = tuple(com.apply_if_callable(x, self.obj) for x in key) - else: - key = com.apply_if_callable(key, self.obj) - indexer = self._get_setitem_indexer(key) - self._has_valid_setitem_indexer(key) - - iloc = self if self.name == "iloc" else self.obj.iloc - iloc._setitem_with_indexer(indexer, value, self.name) - - def _validate_key(self, key, axis: AxisInt): - """ - Ensure that key is valid for current indexer. - - Parameters - ---------- - key : scalar, slice or list-like - Key requested. - axis : int - Dimension on which the indexing is being made. - - Raises - ------ - TypeError - If the key (or some element of it) has wrong type. - IndexError - If the key (or some element of it) is out of bounds. - KeyError - If the key was not found. - """ - raise AbstractMethodError(self) - - @final - def _expand_ellipsis(self, tup: tuple) -> tuple: - """ - If a tuple key includes an Ellipsis, replace it with an appropriate - number of null slices. - """ - if any(x is Ellipsis for x in tup): - if tup.count(Ellipsis) > 1: - raise IndexingError(_one_ellipsis_message) - - if len(tup) == self.ndim: - # It is unambiguous what axis this Ellipsis is indexing, - # treat as a single null slice. - i = tup.index(Ellipsis) - # FIXME: this assumes only one Ellipsis - new_key = tup[:i] + (_NS,) + tup[i + 1 :] - return new_key - - # TODO: other cases? only one test gets here, and that is covered - # by _validate_key_length - return tup - - @final - def _validate_tuple_indexer(self, key: tuple) -> tuple: - """ - Check the key for valid keys across my indexer. - """ - key = self._validate_key_length(key) - key = self._expand_ellipsis(key) - for i, k in enumerate(key): - try: - self._validate_key(k, i) - except ValueError as err: - raise ValueError( - "Location based indexing can only have " - f"[{self._valid_types}] types" - ) from err - return key - - @final - def _is_nested_tuple_indexer(self, tup: tuple) -> bool: - """ - Returns - ------- - bool - """ - if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): - return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) - return False - - @final - def _convert_tuple(self, key: tuple) -> tuple: - # Note: we assume _tupleize_axis_indexer has been called, if necessary. - self._validate_key_length(key) - keyidx = [self._convert_to_indexer(k, axis=i) for i, k in enumerate(key)] - return tuple(keyidx) - - @final - def _validate_key_length(self, key: tuple) -> tuple: - if len(key) > self.ndim: - if key[0] is Ellipsis: - # e.g. Series.iloc[..., 3] reduces to just Series.iloc[3] - key = key[1:] - if Ellipsis in key: - raise IndexingError(_one_ellipsis_message) - return self._validate_key_length(key) - raise IndexingError("Too many indexers") - return key - - @final - def _getitem_tuple_same_dim(self, tup: tuple): - """ - Index with indexers that should return an object of the same dimension - as self.obj. - - This is only called after a failed call to _getitem_lowerdim. - """ - retval = self.obj - # Selecting columns before rows is signficiantly faster - start_val = (self.ndim - len(tup)) + 1 - for i, key in enumerate(reversed(tup)): - i = self.ndim - i - start_val - if com.is_null_slice(key): - continue - - retval = getattr(retval, self.name)._getitem_axis(key, axis=i) - # We should never have retval.ndim < self.ndim, as that should - # be handled by the _getitem_lowerdim call above. - assert retval.ndim == self.ndim - - if retval is self.obj: - # if all axes were a null slice (`df.loc[:, :]`), ensure we still - # return a new object (https://github.com/pandas-dev/pandas/pull/49469) - retval = retval.copy(deep=False) - - return retval - - @final - def _getitem_lowerdim(self, tup: tuple): - # we can directly get the axis result since the axis is specified - if self.axis is not None: - axis = self.obj._get_axis_number(self.axis) - return self._getitem_axis(tup, axis=axis) - - # we may have a nested tuples indexer here - if self._is_nested_tuple_indexer(tup): - return self._getitem_nested_tuple(tup) - - # we maybe be using a tuple to represent multiple dimensions here - ax0 = self.obj._get_axis(0) - # ...but iloc should handle the tuple as simple integer-location - # instead of checking it as multiindex representation (GH 13797) - if ( - isinstance(ax0, MultiIndex) - and self.name != "iloc" - and not any(isinstance(x, slice) for x in tup) - ): - # Note: in all extant test cases, replacing the slice condition with - # `all(is_hashable(x) or com.is_null_slice(x) for x in tup)` - # is equivalent. - # (see the other place where we call _handle_lowerdim_multi_index_axis0) - with suppress(IndexingError): - return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0(tup) - - tup = self._validate_key_length(tup) - - for i, key in enumerate(tup): - if is_label_like(key): - # We don't need to check for tuples here because those are - # caught by the _is_nested_tuple_indexer check above. - section = self._getitem_axis(key, axis=i) - - # We should never have a scalar section here, because - # _getitem_lowerdim is only called after a check for - # is_scalar_access, which that would be. - if section.ndim == self.ndim: - # we're in the middle of slicing through a MultiIndex - # revise the key wrt to `section` by inserting an _NS - new_key = tup[:i] + (_NS,) + tup[i + 1 :] - - else: - # Note: the section.ndim == self.ndim check above - # rules out having DataFrame here, so we dont need to worry - # about transposing. - new_key = tup[:i] + tup[i + 1 :] - - if len(new_key) == 1: - new_key = new_key[0] - - # Slices should return views, but calling iloc/loc with a null - # slice returns a new object. - if com.is_null_slice(new_key): - return section - # This is an elided recursive call to iloc/loc - return getattr(section, self.name)[new_key] - - raise IndexingError("not applicable") - - @final - def _getitem_nested_tuple(self, tup: tuple): - # we have a nested tuple so have at least 1 multi-index level - # we should be able to match up the dimensionality here - - def _contains_slice(x: object) -> bool: - # Check if object is a slice or a tuple containing a slice - if isinstance(x, tuple): - return any(isinstance(v, slice) for v in x) - elif isinstance(x, slice): - return True - return False - - for key in tup: - check_dict_or_set_indexers(key) - - # we have too many indexers for our dim, but have at least 1 - # multi-index dimension, try to see if we have something like - # a tuple passed to a series with a multi-index - if len(tup) > self.ndim: - if self.name != "loc": - # This should never be reached, but let's be explicit about it - raise ValueError("Too many indices") # pragma: no cover - if all( - (is_hashable(x) and not _contains_slice(x)) or com.is_null_slice(x) - for x in tup - ): - # GH#10521 Series should reduce MultiIndex dimensions instead of - # DataFrame, IndexingError is not raised when slice(None,None,None) - # with one row. - with suppress(IndexingError): - return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0( - tup - ) - elif isinstance(self.obj, ABCSeries) and any( - isinstance(k, tuple) for k in tup - ): - # GH#35349 Raise if tuple in tuple for series - # Do this after the all-hashable-or-null-slice check so that - # we are only getting non-hashable tuples, in particular ones - # that themselves contain a slice entry - # See test_loc_series_getitem_too_many_dimensions - raise IndexingError("Too many indexers") - - # this is a series with a multi-index specified a tuple of - # selectors - axis = self.axis or 0 - return self._getitem_axis(tup, axis=axis) - - # handle the multi-axis by taking sections and reducing - # this is iterative - obj = self.obj - # GH#41369 Loop in reverse order ensures indexing along columns before rows - # which selects only necessary blocks which avoids dtype conversion if possible - axis = len(tup) - 1 - for key in tup[::-1]: - if com.is_null_slice(key): - axis -= 1 - continue - - obj = getattr(obj, self.name)._getitem_axis(key, axis=axis) - axis -= 1 - - # if we have a scalar, we are done - if is_scalar(obj) or not hasattr(obj, "ndim"): - break - - return obj - - def _convert_to_indexer(self, key, axis: AxisInt): - raise AbstractMethodError(self) - - @final - def __getitem__(self, key): - check_dict_or_set_indexers(key) - if type(key) is tuple: - key = tuple(list(x) if is_iterator(x) else x for x in key) - key = tuple(com.apply_if_callable(x, self.obj) for x in key) - if self._is_scalar_access(key): - return self.obj._get_value(*key, takeable=self._takeable) - return self._getitem_tuple(key) - else: - # we by definition only have the 0th axis - axis = self.axis or 0 - - maybe_callable = com.apply_if_callable(key, self.obj) - return self._getitem_axis(maybe_callable, axis=axis) - - def _is_scalar_access(self, key: tuple): - raise NotImplementedError() - - def _getitem_tuple(self, tup: tuple): - raise AbstractMethodError(self) - - def _getitem_axis(self, key, axis: AxisInt): - raise NotImplementedError() - - def _has_valid_setitem_indexer(self, indexer) -> bool: - raise AbstractMethodError(self) - - @final - def _getbool_axis(self, key, axis: AxisInt): - # caller is responsible for ensuring non-None axis - labels = self.obj._get_axis(axis) - key = check_bool_indexer(labels, key) - inds = key.nonzero()[0] - return self.obj._take_with_is_copy(inds, axis=axis) - - - @doc(IndexingMixin.loc) - class _LocIndexer(_LocationIndexer): - _takeable: bool = False - _valid_types = ( - "labels (MUST BE IN THE INDEX), slices of labels (BOTH " - "endpoints included! Can be slices of integers if the " - "index is integers), listlike of labels, boolean" - ) - - # ------------------------------------------------------------------- - # Key Checks - - @doc(_LocationIndexer._validate_key) - def _validate_key(self, key, axis: Axis): - # valid for a collection of labels (we check their presence later) - # slice of labels (where start-end in labels) - # slice of integers (only if in the labels) - # boolean not in slice and with boolean index - ax = self.obj._get_axis(axis) - if isinstance(key, bool) and not ( - is_bool_dtype(ax.dtype) - or ax.dtype.name == "boolean" - or isinstance(ax, MultiIndex) - and is_bool_dtype(ax.get_level_values(0).dtype) - ): - raise KeyError( - f"{key}: boolean label can not be used without a boolean index" - ) - - if isinstance(key, slice) and ( - isinstance(key.start, bool) or isinstance(key.stop, bool) - ): - raise TypeError(f"{key}: boolean values can not be used in a slice") - - def _has_valid_setitem_indexer(self, indexer) -> bool: - return True - - def _is_scalar_access(self, key: tuple) -> bool: - """ - Returns - ------- - bool - """ - # this is a shortcut accessor to both .loc and .iloc - # that provide the equivalent access of .at and .iat - # a) avoid getting things via sections and (to minimize dtype changes) - # b) provide a performant path - if len(key) != self.ndim: - return False - - for i, k in enumerate(key): - if not is_scalar(k): - return False - - ax = self.obj.axes[i] - if isinstance(ax, MultiIndex): - return False - - if isinstance(k, str) and ax._supports_partial_string_indexing: - # partial string indexing, df.loc['2000', 'A'] - # should not be considered scalar - return False - - if not ax._index_as_unique: - return False - - return True - - # ------------------------------------------------------------------- - # MultiIndex Handling - - def _multi_take_opportunity(self, tup: tuple) -> bool: - """ - Check whether there is the possibility to use ``_multi_take``. - - Currently the limit is that all axes being indexed, must be indexed with - list-likes. - - Parameters - ---------- - tup : tuple - Tuple of indexers, one per axis. - - Returns - ------- - bool - Whether the current indexing, - can be passed through `_multi_take`. - """ - if not all(is_list_like_indexer(x) for x in tup): - return False - - # just too complicated - return not any(com.is_bool_indexer(x) for x in tup) - - def _multi_take(self, tup: tuple): - """ - Create the indexers for the passed tuple of keys, and - executes the take operation. This allows the take operation to be - executed all at once, rather than once for each dimension. - Improving efficiency. - - Parameters - ---------- - tup : tuple - Tuple of indexers, one per axis. - - Returns - ------- - values: same type as the object being indexed - """ - # GH 836 - d = { - axis: self._get_listlike_indexer(key, axis) - for (key, axis) in zip(tup, self.obj._AXIS_ORDERS) - } - return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True) - - # ------------------------------------------------------------------- - - def _getitem_iterable(self, key, axis: AxisInt): - """ - Index current object with an iterable collection of keys. - - Parameters - ---------- - key : iterable - Targeted labels. - axis : int - Dimension on which the indexing is being made. - - Raises - ------ - KeyError - If no key was found. Will change in the future to raise if not all - keys were found. - - Returns - ------- - scalar, DataFrame, or Series: indexed value(s). - """ - # we assume that not com.is_bool_indexer(key), as that is - # handled before we get here. - self._validate_key(key, axis) - - # A collection of keys - keyarr, indexer = self._get_listlike_indexer(key, axis) - return self.obj._reindex_with_indexers( - {axis: [keyarr, indexer]}, copy=True, allow_dups=True - ) - - def _getitem_tuple(self, tup: tuple): - with suppress(IndexingError): - tup = self._expand_ellipsis(tup) - return self._getitem_lowerdim(tup) - - # no multi-index, so validate all of the indexers - tup = self._validate_tuple_indexer(tup) - - # ugly hack for GH #836 - if self._multi_take_opportunity(tup): - return self._multi_take(tup) - - return self._getitem_tuple_same_dim(tup) - - def _get_label(self, label, axis: AxisInt): - # GH#5567 this will fail if the label is not present in the axis. - return self.obj.xs(label, axis=axis) - - def _handle_lowerdim_multi_index_axis0(self, tup: tuple): - # we have an axis0 multi-index, handle or raise - axis = self.axis or 0 - try: - # fast path for series or for tup devoid of slices - return self._get_label(tup, axis=axis) - - except KeyError as ek: - # raise KeyError if number of indexers match - # else IndexingError will be raised - if self.ndim < len(tup) <= self.obj.index.nlevels: - raise ek - raise IndexingError("No label returned") from ek - - def _getitem_axis(self, key, axis: AxisInt): - key = item_from_zerodim(key) - if is_iterator(key): - key = list(key) - if key is Ellipsis: - key = slice(None) - - labels = self.obj._get_axis(axis) - - if isinstance(key, tuple) and isinstance(labels, MultiIndex): - key = tuple(key) - - if isinstance(key, slice): - self._validate_key(key, axis) - return self._get_slice_axis(key, axis=axis) - elif com.is_bool_indexer(key): - return self._getbool_axis(key, axis=axis) - elif is_list_like_indexer(key): - # an iterable multi-selection - if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): - if hasattr(key, "ndim") and key.ndim > 1: - raise ValueError("Cannot index with multidimensional key") - - return self._getitem_iterable(key, axis=axis) - - # nested tuple slicing - if is_nested_tuple(key, labels): - locs = labels.get_locs(key) - indexer = [slice(None)] * self.ndim - indexer[axis] = locs - return self.obj.iloc[tuple(indexer)] - - # fall thru to straight lookup - self._validate_key(key, axis) - return self._get_label(key, axis=axis) - - def _get_slice_axis(self, slice_obj: slice, axis: AxisInt): - """ - This is pretty simple as we just have to deal with labels. - """ - # caller is responsible for ensuring non-None axis - obj = self.obj - if not need_slice(slice_obj): - return obj.copy(deep=False) - - labels = obj._get_axis(axis) - indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step) - - if isinstance(indexer, slice): - return self.obj._slice(indexer, axis=axis) - else: - # DatetimeIndex overrides Index.slice_indexer and may - # return a DatetimeIndex instead of a slice object. - return self.obj.take(indexer, axis=axis) - - def _convert_to_indexer(self, key, axis: AxisInt): - """ - Convert indexing key into something we can use to do actual fancy - indexing on a ndarray. - - Examples - ix[:5] -> slice(0, 5) - ix[[1,2,3]] -> [1,2,3] - ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz) - - Going by Zen of Python? - 'In the face of ambiguity, refuse the temptation to guess.' - raise AmbiguousIndexError with integer labels? - - No, prefer label-based indexing - """ - labels = self.obj._get_axis(axis) - - if isinstance(key, slice): - return labels._convert_slice_indexer(key, kind="loc") - - if ( - isinstance(key, tuple) - and not isinstance(labels, MultiIndex) - and self.ndim < 2 - and len(key) > 1 - ): - raise IndexingError("Too many indexers") - - # Slices are not valid keys passed in by the user, - # even though they are hashable in Python 3.12 - contains_slice = False - if isinstance(key, tuple): - contains_slice = any(isinstance(v, slice) for v in key) - - if is_scalar(key) or ( - isinstance(labels, MultiIndex) and is_hashable(key) and not contains_slice - ): - # Otherwise get_loc will raise InvalidIndexError - - # if we are a label return me - try: - return labels.get_loc(key) - except LookupError: - if isinstance(key, tuple) and isinstance(labels, MultiIndex): - if len(key) == labels.nlevels: - return {"key": key} - raise - except InvalidIndexError: - # GH35015, using datetime as column indices raises exception - if not isinstance(labels, MultiIndex): - raise - except ValueError: - if not is_integer(key): - raise - return {"key": key} - - if is_nested_tuple(key, labels): - if self.ndim == 1 and any(isinstance(k, tuple) for k in key): - # GH#35349 Raise if tuple in tuple for series - raise IndexingError("Too many indexers") - return labels.get_locs(key) - - elif is_list_like_indexer(key): - if is_iterator(key): - key = list(key) - - if com.is_bool_indexer(key): - key = check_bool_indexer(labels, key) - return key - else: - return self._get_listlike_indexer(key, axis)[1] - else: - try: - return labels.get_loc(key) - except LookupError: - # allow a not found key only if we are a setter - if not is_list_like_indexer(key): - return {"key": key} - raise - - def _get_listlike_indexer(self, key, axis: AxisInt): - """ - Transform a list-like of keys into a new index and an indexer. - - Parameters - ---------- - key : list-like - Targeted labels. - axis: int - Dimension on which the indexing is being made. - - Raises - ------ - KeyError - If at least one key was requested but none was found. - - Returns - ------- - keyarr: Index - New index (coinciding with 'key' if the axis is unique). - values : array-like - Indexer for the return object, -1 denotes keys not found. - """ - ax = self.obj._get_axis(axis) - axis_name = self.obj._get_axis_name(axis) - - keyarr, indexer = ax._get_indexer_strict(key, axis_name) - - return keyarr, indexer - - - @doc(IndexingMixin.iloc) - class _iLocIndexer(_LocationIndexer): - _valid_types = ( - "integer, integer slice (START point is INCLUDED, END " - "point is EXCLUDED), listlike of integers, boolean array" - ) - _takeable = True - - # ------------------------------------------------------------------- - # Key Checks - - def _validate_key(self, key, axis: AxisInt): - if com.is_bool_indexer(key): - if hasattr(key, "index") and isinstance(key.index, Index): - if key.index.inferred_type == "integer": - raise NotImplementedError( - "iLocation based boolean " - "indexing on an integer type " - "is not available" - ) - raise ValueError( - "iLocation based boolean indexing cannot use " - "an indexable as a mask" - ) - return - - if isinstance(key, slice): - return - elif is_integer(key): - self._validate_integer(key, axis) - elif isinstance(key, tuple): - # a tuple should already have been caught by this point - # so don't treat a tuple as a valid indexer - raise IndexingError("Too many indexers") - elif is_list_like_indexer(key): - if isinstance(key, ABCSeries): - arr = key._values - elif is_array_like(key): - arr = key - else: - arr = np.array(key) - len_axis = len(self.obj._get_axis(axis)) - - # check that the key has a numeric dtype - if not is_numeric_dtype(arr.dtype): - raise IndexError(f".iloc requires numeric indexers, got {arr}") - - # check that the key does not exceed the maximum size of the index - if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): - raise IndexError("positional indexers are out-of-bounds") - else: - raise ValueError(f"Can only index by location with a [{self._valid_types}]") - - def _has_valid_setitem_indexer(self, indexer) -> bool: - """ - Validate that a positional indexer cannot enlarge its target - will raise if needed, does not modify the indexer externally. - - Returns - ------- - bool - """ - if isinstance(indexer, dict): - raise IndexError("iloc cannot enlarge its target object") - - if isinstance(indexer, ABCDataFrame): - raise TypeError( - "DataFrame indexer for .iloc is not supported. " - "Consider using .loc with a DataFrame indexer for automatic alignment.", - ) - - if not isinstance(indexer, tuple): - indexer = _tuplify(self.ndim, indexer) - - for ax, i in zip(self.obj.axes, indexer): - if isinstance(i, slice): - # should check the stop slice? - pass - elif is_list_like_indexer(i): - # should check the elements? - pass - elif is_integer(i): - if i >= len(ax): - raise IndexError("iloc cannot enlarge its target object") - elif isinstance(i, dict): - raise IndexError("iloc cannot enlarge its target object") - - return True - - def _is_scalar_access(self, key: tuple) -> bool: - """ - Returns - ------- - bool - """ - # this is a shortcut accessor to both .loc and .iloc - # that provide the equivalent access of .at and .iat - # a) avoid getting things via sections and (to minimize dtype changes) - # b) provide a performant path - if len(key) != self.ndim: - return False - - return all(is_integer(k) for k in key) - - def _validate_integer(self, key: int | np.integer, axis: AxisInt) -> None: - """ - Check that 'key' is a valid position in the desired axis. - - Parameters - ---------- - key : int - Requested position. - axis : int - Desired axis. - - Raises - ------ - IndexError - If 'key' is not a valid position in axis 'axis'. - """ - len_axis = len(self.obj._get_axis(axis)) - if key >= len_axis or key < -len_axis: - raise IndexError("single positional indexer is out-of-bounds") - - # ------------------------------------------------------------------- - - def _getitem_tuple(self, tup: tuple): - tup = self._validate_tuple_indexer(tup) - with suppress(IndexingError): - return self._getitem_lowerdim(tup) - - return self._getitem_tuple_same_dim(tup) - - def _get_list_axis(self, key, axis: AxisInt): - """ - Return Series values by list or array of integers. - - Parameters - ---------- - key : list-like positional indexer - axis : int - - Returns - ------- - Series object - - Notes - ----- - `axis` can only be zero. - """ - try: - return self.obj._take_with_is_copy(key, axis=axis) - except IndexError as err: - # re-raise with different error message, e.g. test_getitem_ndarray_3d - raise IndexError("positional indexers are out-of-bounds") from err - - def _getitem_axis(self, key, axis: AxisInt): - if key is Ellipsis: - key = slice(None) - elif isinstance(key, ABCDataFrame): - raise IndexError( - "DataFrame indexer is not allowed for .iloc\n" - "Consider using .loc for automatic alignment." - ) - - if isinstance(key, slice): - return self._get_slice_axis(key, axis=axis) - - if is_iterator(key): - key = list(key) - - if isinstance(key, list): - key = np.asarray(key) - - if com.is_bool_indexer(key): - self._validate_key(key, axis) - return self._getbool_axis(key, axis=axis) - - # a list of integers - elif is_list_like_indexer(key): - return self._get_list_axis(key, axis=axis) - - # a single integer - else: - key = item_from_zerodim(key) - if not is_integer(key): - raise TypeError("Cannot index by location index with a non-integer key") - - # validate the location - self._validate_integer(key, axis) - - return self.obj._ixs(key, axis=axis) - - def _get_slice_axis(self, slice_obj: slice, axis: AxisInt): - # caller is responsible for ensuring non-None axis - obj = self.obj - - if not need_slice(slice_obj): - return obj.copy(deep=False) - - labels = obj._get_axis(axis) - labels._validate_positional_slice(slice_obj) - return self.obj._slice(slice_obj, axis=axis) - - def _convert_to_indexer(self, key, axis: AxisInt): - """ - Much simpler as we only have to deal with our valid types. - """ - return key - - def _get_setitem_indexer(self, key): - # GH#32257 Fall through to let numpy do validation - if is_iterator(key): - key = list(key) - - if self.axis is not None: - key = _tupleize_axis_indexer(self.ndim, self.axis, key) - - return key - - # ------------------------------------------------------------------- - - def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): - """ - _setitem_with_indexer is for setting values on a Series/DataFrame - using positional indexers. - - If the relevant keys are not present, the Series/DataFrame may be - expanded. - - This method is currently broken when dealing with non-unique Indexes, - since it goes from positional indexers back to labels when calling - BlockManager methods, see GH#12991, GH#22046, GH#15686. - """ - info_axis = self.obj._info_axis_number - - # maybe partial set - take_split_path = not self.obj._mgr.is_single_block - - if not take_split_path and isinstance(value, ABCDataFrame): - # Avoid cast of values - take_split_path = not value._mgr.is_single_block - - # if there is only one block/type, still have to take split path - # unless the block is one-dimensional or it can hold the value - if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: - # in case of dict, keys are indices - val = list(value.values()) if isinstance(value, dict) else value - arr = self.obj._mgr.arrays[0] - take_split_path = not can_hold_element( - arr, extract_array(val, extract_numpy=True) - ) - - # if we have any multi-indexes that have non-trivial slices - # (not null slices) then we must take the split path, xref - # GH 10360, GH 27841 - if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): - for i, ax in zip(indexer, self.obj.axes): - if isinstance(ax, MultiIndex) and not ( - is_integer(i) or com.is_null_slice(i) - ): - take_split_path = True - break - - if isinstance(indexer, tuple): - nindexer = [] - for i, idx in enumerate(indexer): - if isinstance(idx, dict): - # reindex the axis to the new value - # and set inplace - key, _ = convert_missing_indexer(idx) - - # if this is the items axes, then take the main missing - # path first - # this correctly sets the dtype and avoids cache issues - # essentially this separates out the block that is needed - # to possibly be modified - if self.ndim > 1 and i == info_axis: - # add the new item, and set the value - # must have all defined axes if we have a scalar - # or a list-like on the non-info axes if we have a - # list-like - if not len(self.obj): - if not is_list_like_indexer(value): - raise ValueError( - "cannot set a frame with no " - "defined index and a scalar" - ) - self.obj[key] = value - return - - # add a new item with the dtype setup - if com.is_null_slice(indexer[0]): - # We are setting an entire column - self.obj[key] = value - return - elif is_array_like(value): - # GH#42099 - arr = extract_array(value, extract_numpy=True) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - if not isinstance(value, ABCSeries): - # if not Series (in which case we need to align), - # we can short-circuit - if ( - isinstance(arr, np.ndarray) - and arr.ndim == 1 - and len(arr) == 1 - ): - # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 - arr = arr[0, ...] - empty_value[indexer[0]] = arr - self.obj[key] = empty_value - return - - self.obj[key] = empty_value - - else: - # FIXME: GH#42099#issuecomment-864326014 - self.obj[key] = infer_fill_value(value) - - new_indexer = convert_from_missing_indexer_tuple( - indexer, self.obj.axes - ) - self._setitem_with_indexer(new_indexer, value, name) - - return - - # reindex the axis - # make sure to clear the cache because we are - # just replacing the block manager here - # so the object is the same - index = self.obj._get_axis(i) - labels = index.insert(len(index), key) - - # We are expanding the Series/DataFrame values to match - # the length of thenew index `labels`. GH#40096 ensure - # this is valid even if the index has duplicates. - taker = np.arange(len(index) + 1, dtype=np.intp) - taker[-1] = -1 - reindexers = {i: (labels, taker)} - new_obj = self.obj._reindex_with_indexers( - reindexers, allow_dups=True - ) - self.obj._mgr = new_obj._mgr - self.obj._maybe_update_cacher(clear=True) - self.obj._is_copy = None - - nindexer.append(labels.get_loc(key)) - - else: - nindexer.append(idx) - - indexer = tuple(nindexer) - else: - indexer, missing = convert_missing_indexer(indexer) - - if missing: - self._setitem_with_indexer_missing(indexer, value) - return - - if name == "loc": - # must come after setting of missing - indexer, value = self._maybe_mask_setitem_value(indexer, value) - - # align and set the values - if take_split_path: - # We have to operate column-wise - self._setitem_with_indexer_split_path(indexer, value, name) - else: - self._setitem_single_block(indexer, value, name) - - def _setitem_with_indexer_split_path(self, indexer, value, name: str): - """ - Setitem column-wise. - """ - # Above we only set take_split_path to True for 2D cases - assert self.ndim == 2 - - if not isinstance(indexer, tuple): - indexer = _tuplify(self.ndim, indexer) - if len(indexer) > self.ndim: - raise IndexError("too many indices for array") - if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: - raise ValueError(r"Cannot set values with ndim > 2") - - if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): - from pandas import Series - - value = self._align_series(indexer, Series(value)) - - # Ensure we have something we can iterate over - info_axis = indexer[1] - ilocs = self._ensure_iterable_column_indexer(info_axis) - - pi = indexer[0] - lplane_indexer = length_of_indexer(pi, self.obj.index) - # lplane_indexer gives the expected length of obj[indexer[0]] - - # we need an iterable, with a ndim of at least 1 - # eg. don't pass through np.array(0) - if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: - if isinstance(value, ABCDataFrame): - self._setitem_with_indexer_frame_value(indexer, value, name) - - elif np.ndim(value) == 2: - # TODO: avoid np.ndim call in case it isn't an ndarray, since - # that will construct an ndarray, which will be wasteful - self._setitem_with_indexer_2d_value(indexer, value) - - elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): - # We are setting multiple rows in a single column. - self._setitem_single_column(ilocs[0], value, pi) - - elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): - # We are trying to set N values into M entries of a single - # column, which is invalid for N != M - # Exclude zero-len for e.g. boolean masking that is all-false - - if len(value) == 1 and not is_integer(info_axis): - # This is a case like df.iloc[:3, [1]] = [0] - # where we treat as df.iloc[:3, 1] = 0 - return self._setitem_with_indexer((pi, info_axis[0]), value[0]) - - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) - - elif lplane_indexer == 0 and len(value) == len(self.obj.index): - # We get here in one case via .loc with a all-False mask - pass - - elif self._is_scalar_access(indexer) and is_object_dtype( - self.obj.dtypes._values[ilocs[0]] - ): - # We are setting nested data, only possible for object dtype data - self._setitem_single_column(indexer[1], value, pi) - - elif len(ilocs) == len(value): - # We are setting multiple columns in a single row. - for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, pi) - - elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: - # This is a setitem-with-expansion, see - # test_loc_setitem_empty_append_expands_rows_mixed_dtype - # e.g. df = DataFrame(columns=["x", "y"]) - # df["x"] = df["x"].astype(np.int64) - # df.loc[:, "x"] = [1, 2, 3] - self._setitem_single_column(ilocs[0], value, pi) - - else: - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) - - else: - # scalar value - for loc in ilocs: - self._setitem_single_column(loc, value, pi) - - def _setitem_with_indexer_2d_value(self, indexer, value): - # We get here with np.ndim(value) == 2, excluding DataFrame, - # which goes through _setitem_with_indexer_frame_value - pi = indexer[0] - - ilocs = self._ensure_iterable_column_indexer(indexer[1]) - - if not is_array_like(value): - # cast lists to array - value = np.array(value, dtype=object) - if len(ilocs) != value.shape[1]: - raise ValueError( - "Must have equal len keys and value when setting with an ndarray" - ) - - for i, loc in enumerate(ilocs): - value_col = value[:, i] - if is_object_dtype(value_col.dtype): - # casting to list so that we do type inference in setitem_single_column - value_col = value_col.tolist() - self._setitem_single_column(loc, value_col, pi) - - def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): - ilocs = self._ensure_iterable_column_indexer(indexer[1]) - - sub_indexer = list(indexer) - pi = indexer[0] - - multiindex_indexer = isinstance(self.obj.columns, MultiIndex) - - unique_cols = value.columns.is_unique - - # We do not want to align the value in case of iloc GH#37728 - if name == "iloc": - for i, loc in enumerate(ilocs): - val = value.iloc[:, i] - self._setitem_single_column(loc, val, pi) - - elif not unique_cols and value.columns.equals(self.obj.columns): - # We assume we are already aligned, see - # test_iloc_setitem_frame_duplicate_columns_multiple_blocks - for loc in ilocs: - item = self.obj.columns[loc] - if item in value: - sub_indexer[1] = item - val = self._align_series( - tuple(sub_indexer), - value.iloc[:, loc], - multiindex_indexer, - ) - else: - val = np.nan - - self._setitem_single_column(loc, val, pi) - - elif not unique_cols: - raise ValueError("Setting with non-unique columns is not allowed.") - - else: - for loc in ilocs: - item = self.obj.columns[loc] - if item in value: - sub_indexer[1] = item - val = self._align_series( - tuple(sub_indexer), - value[item], - multiindex_indexer, - using_cow=using_copy_on_write(), - ) - else: - val = np.nan - - self._setitem_single_column(loc, val, pi) - - def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: - """ - - Parameters - ---------- - loc : int - Indexer for column position - plane_indexer : int, slice, listlike[int] - The indexer we use for setitem along axis=0. - """ - pi = plane_indexer - - is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)) - - is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0 - - if is_null_setter: - # no-op, don't cast dtype later - return - - elif is_full_setter: - try: - self.obj._mgr.column_setitem( - loc, plane_indexer, value, inplace_only=True - ) - except (ValueError, TypeError, LossySetitemError): - # If we're setting an entire column and we can't do it inplace, - # then we can use value's dtype (or inferred dtype) - # instead of object - self.obj.isetitem(loc, value) - else: - # set value into the column (first attempting to operate inplace, then - # falling back to casting if necessary) - self.obj._mgr.column_setitem(loc, plane_indexer, value) - - self.obj._clear_item_cache() - - def _setitem_single_block(self, indexer, value, name: str) -> None: - """ - _setitem_with_indexer for the case when we have a single Block. - """ - from pandas import Series - - info_axis = self.obj._info_axis_number - item_labels = self.obj._get_axis(info_axis) - if isinstance(indexer, tuple): - # if we are setting on the info axis ONLY - # set using those methods to avoid block-splitting - # logic here - if ( - self.ndim == len(indexer) == 2 - and is_integer(indexer[1]) - and com.is_null_slice(indexer[0]) - ): - col = item_labels[indexer[info_axis]] - if len(item_labels.get_indexer_for([col])) == 1: - # e.g. test_loc_setitem_empty_append_expands_rows - loc = item_labels.get_loc(col) - self._setitem_single_column(loc, value, indexer[0]) - return - - indexer = maybe_convert_ix(*indexer) # e.g. test_setitem_frame_align - - if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): - # TODO(EA): ExtensionBlock.setitem this causes issues with - # setting for extensionarrays that store dicts. Need to decide - # if it's worth supporting that. - value = self._align_series(indexer, Series(value)) - - elif isinstance(value, ABCDataFrame) and name != "iloc": - value = self._align_frame(indexer, value)._values - - # check for chained assignment - self.obj._check_is_chained_assignment_possible() - - # actually do the set - self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) - self.obj._maybe_update_cacher(clear=True, inplace=True) - - def _setitem_with_indexer_missing(self, indexer, value): - """ - Insert new row(s) or column(s) into the Series or DataFrame. - """ - from pandas import Series - - # reindex the axis to the new value - # and set inplace - if self.ndim == 1: - index = self.obj.index - new_index = index.insert(len(index), indexer) - - # we have a coerced indexer, e.g. a float - # that matches in an int64 Index, so - # we will not create a duplicate index, rather - # index to that element - # e.g. 0.0 -> 0 - # GH#12246 - if index.is_unique: - # pass new_index[-1:] instead if [new_index[-1]] - # so that we retain dtype - new_indexer = index.get_indexer(new_index[-1:]) - if (new_indexer != -1).any(): - # We get only here with loc, so can hard code - return self._setitem_with_indexer(new_indexer, value, "loc") - - # this preserves dtype of the value and of the object - if not is_scalar(value): - new_dtype = None - - elif is_valid_na_for_dtype(value, self.obj.dtype): - if not is_object_dtype(self.obj.dtype): - # Every NA value is suitable for object, no conversion needed - value = na_value_for_dtype(self.obj.dtype, compat=False) - - new_dtype = maybe_promote(self.obj.dtype, value)[0] - - elif isna(value): - new_dtype = None - elif not self.obj.empty and not is_object_dtype(self.obj.dtype): - # We should not cast, if we have object dtype because we can - # set timedeltas into object series - curr_dtype = self.obj.dtype - curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype) - new_dtype = maybe_promote(curr_dtype, value)[0] - else: - new_dtype = None - - new_values = Series([value], dtype=new_dtype)._values - - if len(self.obj._values): - # GH#22717 handle casting compatibility that np.concatenate - # does incorrectly - new_values = concat_compat([self.obj._values, new_values]) - self.obj._mgr = self.obj._constructor( - new_values, index=new_index, name=self.obj.name - )._mgr - self.obj._maybe_update_cacher(clear=True) - - elif self.ndim == 2: - if not len(self.obj.columns): - # no columns and scalar - raise ValueError("cannot set a frame with no defined columns") - - has_dtype = hasattr(value, "dtype") - if isinstance(value, ABCSeries): - # append a Series - value = value.reindex(index=self.obj.columns, copy=True) - value.name = indexer - elif isinstance(value, dict): - value = Series( - value, index=self.obj.columns, name=indexer, dtype=object - ) - else: - # a list-list - if is_list_like_indexer(value): - # must have conforming columns - if len(value) != len(self.obj.columns): - raise ValueError("cannot set a row with mismatched columns") - - value = Series(value, index=self.obj.columns, name=indexer) - - if not len(self.obj): - # We will ignore the existing dtypes instead of using - # internals.concat logic - df = value.to_frame().T - - idx = self.obj.index - if isinstance(idx, MultiIndex): - name = idx.names - else: - name = idx.name - - df.index = Index([indexer], name=name) - if not has_dtype: - # i.e. if we already had a Series or ndarray, keep that - # dtype. But if we had a list or dict, then do inference - df = df.infer_objects(copy=False) - self.obj._mgr = df._mgr - else: - self.obj._mgr = self.obj._append(value)._mgr - self.obj._maybe_update_cacher(clear=True) - - def _ensure_iterable_column_indexer(self, column_indexer): - """ - Ensure that our column indexer is something that can be iterated over. - """ - ilocs: Sequence[int | np.integer] | np.ndarray - if is_integer(column_indexer): - ilocs = [column_indexer] - elif isinstance(column_indexer, slice): - ilocs = np.arange(len(self.obj.columns))[column_indexer] - elif ( - isinstance(column_indexer, np.ndarray) and column_indexer.dtype.kind == "b" - ): - ilocs = np.arange(len(column_indexer))[column_indexer] - else: - ilocs = column_indexer - return ilocs - - def _align_series( - self, - indexer, - ser: Series, - multiindex_indexer: bool = False, - using_cow: bool = False, - ): - """ - Parameters - ---------- - indexer : tuple, slice, scalar - Indexer used to get the locations that will be set to `ser`. - ser : pd.Series - Values to assign to the locations specified by `indexer`. - multiindex_indexer : bool, optional - Defaults to False. Should be set to True if `indexer` was from - a `pd.MultiIndex`, to avoid unnecessary broadcasting. - - Returns - ------- - `np.array` of `ser` broadcast to the appropriate shape for assignment - to the locations selected by `indexer` - """ - if isinstance(indexer, (slice, np.ndarray, list, Index)): - indexer = (indexer,) - - if isinstance(indexer, tuple): - # flatten np.ndarray indexers - def ravel(i): - return i.ravel() if isinstance(i, np.ndarray) else i - - indexer = tuple(map(ravel, indexer)) - - aligners = [not com.is_null_slice(idx) for idx in indexer] - sum_aligners = sum(aligners) - single_aligner = sum_aligners == 1 - is_frame = self.ndim == 2 - obj = self.obj - - # are we a single alignable value on a non-primary - # dim (e.g. panel: 1,2, or frame: 0) ? - # hence need to align to a single axis dimension - # rather that find all valid dims - - # frame - if is_frame: - single_aligner = single_aligner and aligners[0] - - # we have a frame, with multiple indexers on both axes; and a - # series, so need to broadcast (see GH5206) - if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): - ser_values = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values - - # single indexer - if len(indexer) > 1 and not multiindex_indexer: - len_indexer = len(indexer[1]) - ser_values = ( - np.tile(ser_values, len_indexer).reshape(len_indexer, -1).T - ) - - return ser_values - - for i, idx in enumerate(indexer): - ax = obj.axes[i] - - # multiple aligners (or null slices) - if is_sequence(idx) or isinstance(idx, slice): - if single_aligner and com.is_null_slice(idx): - continue - new_ix = ax[idx] - if not is_list_like_indexer(new_ix): - new_ix = Index([new_ix]) - else: - new_ix = Index(new_ix) - if ser.index.equals(new_ix): - if using_cow: - return ser - return ser._values.copy() - - return ser.reindex(new_ix)._values - - # 2 dims - elif single_aligner: - # reindex along index - ax = self.obj.axes[1] - if ser.index.equals(ax) or not len(ax): - return ser._values.copy() - return ser.reindex(ax)._values - - elif is_integer(indexer) and self.ndim == 1: - if is_object_dtype(self.obj.dtype): - return ser - ax = self.obj._get_axis(0) - - if ser.index.equals(ax): - return ser._values.copy() - - return ser.reindex(ax)._values[indexer] - - elif is_integer(indexer): - ax = self.obj._get_axis(1) - - if ser.index.equals(ax): - return ser._values.copy() - - return ser.reindex(ax)._values - - raise ValueError("Incompatible indexer with Series") - - def _align_frame(self, indexer, df: DataFrame) -> DataFrame: - is_frame = self.ndim == 2 - - if isinstance(indexer, tuple): - idx, cols = None, None - sindexers = [] - for i, ix in enumerate(indexer): - ax = self.obj.axes[i] - if is_sequence(ix) or isinstance(ix, slice): - if isinstance(ix, np.ndarray): - ix = ix.ravel() - if idx is None: - idx = ax[ix] - elif cols is None: - cols = ax[ix] - else: - break - else: - sindexers.append(i) - - if idx is not None and cols is not None: - if df.index.equals(idx) and df.columns.equals(cols): - val = df.copy() - else: - val = df.reindex(idx, columns=cols) - return val - - elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame: - ax = self.obj.index[indexer] - if df.index.equals(ax): - val = df.copy() - else: - # we have a multi-index and are trying to align - # with a particular, level GH3738 - if ( - isinstance(ax, MultiIndex) - and isinstance(df.index, MultiIndex) - and ax.nlevels != df.index.nlevels - ): - raise TypeError( - "cannot align on a multi-index with out " - "specifying the join levels" - ) - - val = df.reindex(index=ax) - return val - - raise ValueError("Incompatible indexer with DataFrame") - - - class _ScalarAccessIndexer(NDFrameIndexerBase): - """ - Access scalars quickly. - """ - - # sub-classes need to set _takeable - _takeable: bool - - def _convert_key(self, key): - raise AbstractMethodError(self) - - def __getitem__(self, key): - if not isinstance(key, tuple): - # we could have a convertible item here (e.g. Timestamp) - if not is_list_like_indexer(key): - key = (key,) - else: - raise ValueError("Invalid call for scalar access (getting)!") - - key = self._convert_key(key) - return self.obj._get_value(*key, takeable=self._takeable) - - def __setitem__(self, key, value) -> None: - if isinstance(key, tuple): - key = tuple(com.apply_if_callable(x, self.obj) for x in key) - else: - # scalar callable may return tuple - key = com.apply_if_callable(key, self.obj) - - if not isinstance(key, tuple): - key = _tuplify(self.ndim, key) - key = list(self._convert_key(key)) - if len(key) != self.ndim: - raise ValueError("Not enough indexers for scalar access (setting)!") - - self.obj._set_value(*key, value=value, takeable=self._takeable) - - - @doc(IndexingMixin.at) - class _AtIndexer(_ScalarAccessIndexer): - _takeable = False - - def _convert_key(self, key): - """ - Require they keys to be the same type as the index. (so we don't - fallback) - """ - # GH 26989 - # For series, unpacking key needs to result in the label. - # This is already the case for len(key) == 1; e.g. (1,) - if self.ndim == 1 and len(key) > 1: - key = (key,) - - return key - - @property - def _axes_are_unique(self) -> bool: - # Only relevant for self.ndim == 2 - assert self.ndim == 2 - return self.obj.index.is_unique and self.obj.columns.is_unique - - def __getitem__(self, key): - if self.ndim == 2 and not self._axes_are_unique: - # GH#33041 fall back to .loc - if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): - raise ValueError("Invalid call for scalar access (getting)!") - return self.obj.loc[key] - - return super().__getitem__(key) - - def __setitem__(self, key, value) -> None: - if self.ndim == 2 and not self._axes_are_unique: - # GH#33041 fall back to .loc - if not isinstance(key, tuple) or not all(is_scalar(x) for x in key): - raise ValueError("Invalid call for scalar access (setting)!") - - self.obj.loc[key] = value - return - - return super().__setitem__(key, value) - - - @doc(IndexingMixin.iat) - class _iAtIndexer(_ScalarAccessIndexer): - _takeable = True - - def _convert_key(self, key): - """ - Require integer args. (and convert to label arguments) - """ - for i in key: - if not is_integer(i): - raise ValueError("iAt based indexing can only have integer indexers") - return key - - - def _tuplify(ndim: int, loc: Hashable) -> tuple[Hashable | slice, ...]: - """ - Given an indexer for the first dimension, create an equivalent tuple - for indexing over all dimensions. - - Parameters - ---------- - ndim : int - loc : object - - Returns - ------- - tuple - """ - _tup: list[Hashable | slice] - _tup = [slice(None, None) for _ in range(ndim)] - _tup[0] = loc - return tuple(_tup) - - - def _tupleize_axis_indexer(ndim: int, axis: AxisInt, key) -> tuple: - """ - If we have an axis, adapt the given key to be axis-independent. - """ - new_key = [slice(None)] * ndim - new_key[axis] = key - return tuple(new_key) - - - def check_bool_indexer(index: Index, key) -> np.ndarray: - """ - Check if key is a valid boolean indexer for an object with such index and - perform reindexing or conversion if needed. - - This function assumes that is_bool_indexer(key) == True. - - Parameters - ---------- - index : Index - Index of the object on which the indexing is done. - key : list-like - Boolean indexer to check. - - Returns - ------- - np.array - Resulting key. - - Raises - ------ - IndexError - If the key does not have the same length as index. - IndexingError - If the index of the key is unalignable to index. - """ - result = key - if isinstance(key, ABCSeries) and not key.index.equals(index): - indexer = result.index.get_indexer_for(index) - if -1 in indexer: -> raise IndexingError( - "Unalignable boolean Series provided as " - "indexer (index of the boolean Series and of " - "the indexed object do not match)." - ) -E pandas.errors.IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match). - -/home/vscode/.local/lib/python3.10/site-packages/pandas/core/indexing.py:2575: IndexingError ------------------------------- Captured log setup ------------------------------ -INFO distributed.http.proxy:proxy.py:85 To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy -INFO distributed.scheduler:scheduler.py:1711 State start -INFO distributed.scheduler:scheduler.py:4072 Scheduler at: tcp://127.0.0.1:45065 -INFO distributed.scheduler:scheduler.py:4087 dashboard at: http://127.0.0.1:8787/status -INFO distributed.scheduler:scheduler.py:7879 Registering Worker plugin shuffle -INFO distributed.nanny:nanny.py:368 Start Nanny at: 'tcp://127.0.0.1:33669' -INFO distributed.nanny:nanny.py:368 Start Nanny at: 'tcp://127.0.0.1:40145' -INFO distributed.nanny:nanny.py:368 Start Nanny at: 'tcp://127.0.0.1:33031' -INFO distributed.scheduler:scheduler.py:4424 Register worker -INFO distributed.scheduler:scheduler.py:5934 Starting worker compute stream, tcp://127.0.0.1:37897 -INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53954 -INFO distributed.scheduler:scheduler.py:4424 Register worker -INFO distributed.scheduler:scheduler.py:5934 Starting worker compute stream, tcp://127.0.0.1:43287 -INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53964 -INFO distributed.scheduler:scheduler.py:4424 Register worker -INFO distributed.scheduler:scheduler.py:5934 Starting worker compute stream, tcp://127.0.0.1:39929 -INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53972 -INFO distributed.scheduler:scheduler.py:5691 Receive client connection: Client-f60972f6-2885-11ef-9469-00224879e9b9 -INFO distributed.core:core.py:1019 Starting established connection to tcp://127.0.0.1:53982 ------------------------------ Captured stderr call ----------------------------- -/home/vscode/.local/lib/python3.10/site-packages/dask/core.py:127: UserWarning: Boolean Series key will be reindexed to match DataFrame index. - return func(*(_execute_task(a, cache) for a in args)) -2024-06-12 06:35:33,039 - distributed.worker - WARNING - Compute Failed -Key: ('operation-fused-cast_df-54ec80aac981a3f77a9a8ded7f391d20', 0) -Function: execute_task -args: ((, {'operation-fused-cast_df-54ec80aac981a3f77a9a8ded7f391d20': ('cast_df-47756cff5a2ca85eca21dc71eb9af42b', 0), ('cast_df-47756cff5a2ca85eca21dc71eb9af42b', 0): (, , [('operation-91c288ed66d23ec6ec6c9eac97efd7e3', 0)], {'schema': a: double -c: double, 'use_extension_types': True, 'use_arrow_dtype': True, '_func': >, '_meta': Empty DataFrame -Columns: [a, c] -Index: []}), ('operation-91c288ed66d23ec6ec6c9eac97efd7e3', 0): (, ('reset_index-91a375bc8c1254295ada72ea3f88a352', 0), ['a', 'c']), ('reset_index-91a375bc8c1254295ada72ea3f88a352', 0): (, , [('getitem-46689d9bcc98f0ef115dccb4f41fbf9a', 0)], {'drop': False}), ('getitem-46689d9bcc98f0ef115dccb4f4 -kwargs: {} -Exception: "IndexingError('Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).')" - ----------------------------- Captured log teardown ----------------------------- -INFO distributed.scheduler:scheduler.py:5735 Remove client Client-f60972f6-2885-11ef-9469-00224879e9b9 -INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53982; closing. -INFO distributed.scheduler:scheduler.py:5735 Remove client Client-f60972f6-2885-11ef-9469-00224879e9b9 -INFO distributed.scheduler:scheduler.py:5727 Close client connection: Client-f60972f6-2885-11ef-9469-00224879e9b9 -INFO distributed.scheduler:scheduler.py:7322 Retire worker addresses (0, 1, 2) -INFO distributed.nanny:nanny.py:609 Closing Nanny at 'tcp://127.0.0.1:33669'. Reason: nanny-close -INFO distributed.nanny:nanny.py:854 Nanny asking worker to close. Reason: nanny-close -INFO distributed.nanny:nanny.py:609 Closing Nanny at 'tcp://127.0.0.1:40145'. Reason: nanny-close -INFO distributed.nanny:nanny.py:854 Nanny asking worker to close. Reason: nanny-close -INFO distributed.nanny:nanny.py:609 Closing Nanny at 'tcp://127.0.0.1:33031'. Reason: nanny-close -INFO distributed.nanny:nanny.py:854 Nanny asking worker to close. Reason: nanny-close -INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53972; closing. -INFO distributed.scheduler:scheduler.py:5209 Remove worker (stimulus_id='handle-worker-cleanup-1718174133.9382055') -INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53964; closing. -INFO distributed.core:core.py:1044 Received 'close-stream' from tcp://127.0.0.1:53954; closing. -INFO distributed.scheduler:scheduler.py:5209 Remove worker (stimulus_id='handle-worker-cleanup-1718174133.9406834') -INFO distributed.scheduler:scheduler.py:5209 Remove worker (stimulus_id='handle-worker-cleanup-1718174133.9411561') -INFO distributed.scheduler:scheduler.py:5336 Lost all workers -INFO distributed.scheduler:scheduler.py:4146 Scheduler closing due to unknown reason... -INFO distributed.scheduler:scheduler.py:4164 Scheduler closing all comms -=============================== warnings summary =============================== -../../usr/local/py-utils/venvs/pytest/lib/python3.10/site-packages/_pytest/config/__init__.py:1448 - /usr/local/py-utils/venvs/pytest/lib/python3.10/site-packages/_pytest/config/__init__.py:1448: PytestConfigWarning: Unknown config option: spark_options - - self._warn_or_fail_if_strict(f"Unknown config option: {key}\n") - -fugue_sql/__init__.py:8 - /workspaces/fugue/fugue_sql/__init__.py:8: UserWarning: fsql and FugueSQLWorkflow now should be imported directly from fugue, fugue_sql will be removed in 0.9.0 - warnings.warn( - -tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] -tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] - /home/vscode/.local/lib/python3.10/site-packages/dask_expr/_expr.py:496: UserWarning: Boolean Series key will be reindexed to match DataFrame index. - return self.operation(*args, **self._kwargs) - --- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html - ----------- coverage: platform linux, python 3.10.14-final-0 ---------- -Name Stmts Miss Cover Missing -------------------------------------------------------------------------------- -fugue/__init__.py 33 33 0% 2-90 -fugue/_utils/display.py 53 53 0% 1-86 -fugue/_utils/exception.py 37 37 0% 1-77 -fugue/_utils/interfaceless.py 25 25 0% 1-73 -fugue/_utils/io.py 176 176 0% 1-290 -fugue/_utils/misc.py 13 13 0% 1-26 -fugue/_utils/registry.py 6 6 0% 1-10 -fugue/api.py 5 5 0% 3-71 -fugue/bag/__init__.py 1 1 0% 2 -fugue/bag/array_bag.py 28 28 0% 1-39 -fugue/bag/bag.py 46 46 0% 1-107 -fugue/collections/partition.py 197 197 0% 1-469 -fugue/collections/sql.py 53 53 0% 1-151 -fugue/collections/yielded.py 26 26 0% 1-74 -fugue/column/__init__.py 3 3 0% 2-4 -fugue/column/expressions.py 282 282 0% 1-856 -fugue/column/functions.py 48 48 0% 1-370 -fugue/column/sql.py 252 252 0% 1-497 -fugue/constants.py 28 28 0% 1-92 -fugue/dataframe/__init__.py 10 10 0% 2-21 -fugue/dataframe/api.py 68 68 0% 1-340 -fugue/dataframe/array_dataframe.py 75 75 0% 1-128 -fugue/dataframe/arrow_dataframe.py 183 183 0% 1-342 -fugue/dataframe/dataframe.py 178 178 0% 1-505 -fugue/dataframe/dataframe_iterable_dataframe.py 106 106 0% 1-208 -fugue/dataframe/dataframes.py 49 49 0% 1-109 -fugue/dataframe/function_wrapper.py 274 274 0% 1-460 -fugue/dataframe/iterable_dataframe.py 80 80 0% 1-138 -fugue/dataframe/pandas_dataframe.py 186 186 0% 1-355 -fugue/dataframe/utils.py 113 113 0% 1-291 -fugue/dataset/__init__.py 2 2 0% 2-3 -fugue/dataset/api.py 31 31 0% 1-100 -fugue/dataset/dataset.py 36 36 0% 1-148 -fugue/dev.py 14 14 0% 7-47 -fugue/exceptions.py 17 17 0% 1-65 -fugue/execution/__init__.py 4 4 0% 2-13 -fugue/execution/api.py 128 128 0% 1-1219 -fugue/execution/execution_engine.py 296 296 0% 1-1360 -fugue/execution/factory.py 83 83 0% 1-577 -fugue/execution/native_execution_engine.py 175 175 0% 1-422 -fugue/extensions/__init__.py 5 5 0% 2-6 -fugue/extensions/_builtins/__init__.py 3 3 0% 2-10 -fugue/extensions/_builtins/creators.py 34 34 0% 1-54 -fugue/extensions/_builtins/outputters.py 111 111 0% 1-176 -fugue/extensions/_builtins/processors.py 228 228 0% 1-375 -fugue/extensions/_utils.py 71 71 0% 1-143 -fugue/extensions/context.py 54 54 0% 1-121 -fugue/extensions/creator/__init__.py 2 2 0% 2-8 -fugue/extensions/creator/convert.py 71 71 0% 1-228 -fugue/extensions/creator/creator.py 4 4 0% 1-7 -fugue/extensions/outputter/__init__.py 2 2 0% 2-8 -fugue/extensions/outputter/convert.py 73 73 0% 1-217 -fugue/extensions/outputter/outputter.py 4 4 0% 1-7 -fugue/extensions/processor/__init__.py 2 2 0% 2-8 -fugue/extensions/processor/convert.py 83 83 0% 1-253 -fugue/extensions/processor/processor.py 4 4 0% 1-7 -fugue/extensions/transformer/__init__.py 2 2 0% 2-14 -fugue/extensions/transformer/constants.py 1 1 0% 1 -fugue/extensions/transformer/convert.py 254 254 0% 1-688 -fugue/extensions/transformer/transformer.py 20 20 0% 1-210 -fugue/plugins.py 9 9 0% 3-43 -fugue/registry.py 9 9 0% 1-30 -fugue/rpc/__init__.py 1 1 0% 2 -fugue/rpc/base.py 121 121 0% 1-280 -fugue/rpc/flask.py 62 62 0% 1-131 -fugue/sql/_utils.py 37 37 0% 1-67 -fugue/sql/_visitors.py 534 534 0% 3-913 -fugue/sql/api.py 29 29 0% 1-287 -fugue/sql/workflow.py 47 47 0% 1-82 -fugue/test/__init__.py 2 2 0% 2-3 -fugue/test/pandas_tester.py 15 15 0% 1-23 -fugue/test/plugins.py 131 131 0% 1-393 -fugue/workflow/__init__.py 5 5 0% 3-7 -fugue/workflow/_checkpoint.py 87 87 0% 1-173 -fugue/workflow/_tasks.py 179 179 0% 1-347 -fugue/workflow/_workflow_context.py 47 47 0% 1-78 -fugue/workflow/api.py 72 72 0% 1-307 -fugue/workflow/input.py 1 1 0% 1 -fugue/workflow/module.py 93 93 0% 1-165 -fugue/workflow/workflow.py 421 421 0% 1-2302 -fugue_dask/_io.py 109 86 21% 24-33, 44-57, 61-66, 72-92, 96-97, 103-106, 114-144, 148-149, 153-156, 164-170 -fugue_dask/_utils.py 118 74 37% 39-41, 53-59, 78-90, 111-121, 125-130, 136-143, 149-169, 175-188, 192-216, 221, 226-227 -fugue_dask/dataframe.py 222 105 53% 66-67, 69-71, 74, 78-84, 92, 103, 107, 112, 117, 121, 125, 128-129, 132-133, 136-137, 140-142, 145, 154-155, 166-171, 176, 182-186, 191-193, 198, 203, 208-210, 240, 245, 250, 255, 260, 265, 270, 275, 280-283, 290-295, 302-305, 315-318, 325-329, 336-338, 345-350, 357-360, 364-366, 370, 379-392 -fugue_dask/execution_engine.py 251 149 41% 61, 91, 95, 106-184, 214, 218, 233, 242, 269-306, 309, 317-319, 328-345, 353-360, 368-378, 386-396, 399-400, 409-417, 420-437, 447-455, 465-526, 535, 549-565, 596, 601 -fugue_dask/registry.py 34 8 76% 29, 34, 50, 61-62, 65-67 -fugue_duckdb/_io.py 144 124 14% 22-26, 31-33, 42-57, 67-85, 88-98, 103-172, 175-187, 192-207 -fugue_duckdb/_utils.py 97 80 18% 38, 42-43, 47, 51-79, 83-96, 102-131, 140-158 -fugue_duckdb/dask.py 66 40 39% 33-34, 37, 40, 44, 47-58, 61-64, 67-69, 77-79, 91-102, 113-115, 120-128, 131-135 -fugue_duckdb/dataframe.py 130 66 49% 45-46, 50, 55, 58, 62, 65-68, 71, 74, 77, 80-88, 91-104, 107, 110, 113-116, 121, 126, 131, 136, 141-143, 148, 153, 158, 163, 168-170, 175-176, 181, 186, 193-196, 203-212, 219, 228, 235, 244, 248-250 -fugue_duckdb/execution_engine.py 272 212 22% 44, 47-51, 54, 64-77, 84-85, 93, 96-104, 107-113, 116-130, 148-166, 169-176, 179-180, 183, 187, 191, 195, 198, 201, 204, 207, 216, 225-231, 240-307, 310, 313-320, 338-342, 348-349, 358-372, 375-399, 409-423, 433-477, 486-487, 499-503, 506-510, 513-517, 523-554 -fugue_duckdb/registry.py 51 9 82% 34, 90, 106-107, 113-114, 117-119 -fugue_duckdb/tester.py 35 12 66% 24-25, 36-38, 43-49 -fugue_ibis/_utils.py 109 81 26% 61-62, 69-77, 81-98, 102-117, 125-126, 132-134, 137-138, 143-145, 148, 158-166, 184-211 -fugue_ibis/dataframe.py 143 87 39% 24-29, 39, 42, 45, 61, 65, 69, 73, 81, 84-87, 90, 93-94, 97-98, 101-106, 109-112, 118, 121, 124-127, 132-134, 139-142, 147-149, 154-157, 162-164, 167-181, 186, 191, 196-201, 206-209, 213-221, 225-227 -fugue_ibis/execution_engine.py 243 175 28% 42, 50-58, 65, 91-123, 126-129, 134-137, 142-145, 148-150, 159-177, 180-207, 217-269, 272, 282-287, 292, 298, 310, 313-314, 318, 329-330, 359-360, 368, 372, 376, 379, 383, 386, 389-391, 394-397, 400-402, 410-412, 421-423, 426-428, 433-435, 440-442, 445-447, 456-460, 463-465, 475-483, 499-507, 522 -fugue_polars/_utils.py 4 1 75% 6 -fugue_polars/polars_dataframe.py 147 74 50% 63-74, 79, 82, 86, 89-90, 93-94, 97, 100, 105-110, 113, 116, 119, 122-123, 126, 131, 136, 139, 144, 149, 154, 159-161, 166, 171, 176, 181, 186, 191, 196, 201-203, 208-214, 219-224, 229-232, 239-241, 248-249, 256-257, 264-265, 269-271 -fugue_polars/registry.py 45 21 53% 21, 27, 30-31, 37, 47-48, 57-70, 77, 81-86 -fugue_ray/_utils/cluster.py 9 4 56% 8-11, 15-16 -fugue_ray/_utils/dataframe.py 37 23 38% 15-16, 21-26, 30-38, 52, 58-99, 113-120 -fugue_ray/_utils/io.py 119 88 26% 27-34, 47-57, 69-84, 92, 100-107, 117, 123-128, 135-141, 146-203, 208-247, 250 -fugue_ray/dataframe.py 208 140 33% 55-107, 114, 117, 121, 124-131, 135, 139, 143, 146-147, 150-152, 159-162, 166-167, 170, 173, 176, 179-190, 193-205, 210, 215, 218, 223, 228-231, 236-248, 256, 261, 266, 271, 276-277, 289-296, 303, 308, 315-318, 325, 332-335, 342, 346-356 -fugue_ray/execution_engine.py 136 93 32% 35, 39, 50-59, 75-144, 154-201, 218-219, 222, 226, 229, 232-236, 239, 242-262, 265, 273-275, 279-281, 284-291, 300, 314-316, 328-331, 335-341, 344-351 -fugue_ray/registry.py 32 7 78% 24, 29, 46-47, 50-52 -fugue_ray/tester.py 14 2 86% 21-22 -fugue_spark/_utils/convert.py 133 99 26% 38-44, 50-55, 59-64, 70-106, 110-116, 120-135, 144-163, 169-170, 177-181, 194-211, 220-237, 241-246, 250-268, 272 -fugue_spark/_utils/io.py 96 77 20% 19-20, 33-43, 55-65, 71-79, 84-90, 95-129, 132-139 -fugue_spark/_utils/misc.py 11 2 82% 13, 25 -fugue_spark/_utils/partition.py 51 37 27% 17-23, 29-35, 41-78, 82, 100-117 -fugue_spark/dataframe.py 199 114 43% 67-77, 85, 93, 96, 100, 104, 107-110, 114, 118, 121-122, 125-128, 131-132, 135-136, 139, 142, 145-149, 152-155, 160, 165, 170, 175, 180-181, 187-192, 195-197, 202, 207, 212, 217, 222, 227, 232, 237, 242, 247, 254-257, 264-269, 276-279, 289-292, 299-301, 311-324, 331-333, 340-344, 348-354, 358-360, 364 -fugue_spark/execution_engine.py 326 243 25% 82, 86, 90, 93-98, 106, 111, 122-135, 146-190, 201-235, 246-334, 349-369, 372, 380-383, 387, 391, 395, 398, 401, 404-414, 442, 445-476, 479-481, 489-493, 502-516, 524-533, 538-548, 553-563, 566-567, 576-577, 580-597, 607-632, 642-698, 707, 721-723, 734-735, 738-747, 753-817 -fugue_spark/registry.py 74 25 66% 42-47, 52, 57-60, 90-91, 97-98, 104-105, 108-110, 119-120, 123-125, 131 -fugue_spark/tester.py 27 7 74% 32, 37-38, 75-78 -fugue_test/bag_suite.py 62 62 0% 4-84 -fugue_test/builtin_suite.py 1174 1174 0% 2-2018 -fugue_test/dataframe_suite.py 278 278 0% 3-462 -fugue_test/execution_suite.py 571 571 0% 2-1299 -------------------------------------------------------------------------------- -TOTAL 12184 10800 11% - -16 files skipped due to complete coverage. - -=========================== short test summary info ============================ -FAILED tests/fugue_dask/test_execution_engine.py::DaskExecutionEngineTests::test_select[dask] - pandas.errors.IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match). -======================== 1 failed, 4 warnings in 9.53s ========================= From a32be3106f08b29392588af20e5838af223479c2 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 13 Jun 2024 06:58:09 +0000 Subject: [PATCH 8/8] update --- RELEASE.md | 5 +++++ tests/fugue_dask/test_execution_engine.py | 1 + tests/fugue_ibis/test_execution_engine.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 0f46183b..ce136c76 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,10 @@ # Release Notes +## 0.9.1 + +- [543](https://github.com/fugue-project/fugue/issues/543) Support type hinting with standard collections +- [544](https://github.com/fugue-project/fugue/issues/544) Fix Spark connect import issue on worker side + ## 0.9.0 - [482](https://github.com/fugue-project/fugue/issues/482) Move Fugue SQL dependencies into extra `[sql]` and functions to become soft dependencies diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index 60f1121f..11f52487 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -55,6 +55,7 @@ def test__join_outer_pandas_incompatible(self): # TODO: dask-sql 2024.5.0 has a bug, can't pass the HAVING tests def test_select(self): try: + import qpd import dask_sql except ImportError: return diff --git a/tests/fugue_ibis/test_execution_engine.py b/tests/fugue_ibis/test_execution_engine.py index 915e1895..2f9e7333 100644 --- a/tests/fugue_ibis/test_execution_engine.py +++ b/tests/fugue_ibis/test_execution_engine.py @@ -23,6 +23,9 @@ def test_properties(self): assert not self.engine.map_engine.is_distributed assert not self.engine.sql_engine.is_distributed + assert self.engine.sql_engine.get_temp_table_name( + ) != self.engine.sql_engine.get_temp_table_name() + def test_select(self): # it can't work properly with DuckDB (hugeint is not recognized) pass