Source code for pandas.core.frame
+"""
+DataFrame
+---------
+An efficient 2D container for potentially mixed-type time series or other
+labeled data series.
+
+Similar to its R counterpart, data.frame, except providing automatic data
+alignment and a host of useful data manipulation methods having to do with the
+labeling information.
+"""
+import collections
+from collections import OrderedDict, abc
+import functools
+from io import StringIO
+import itertools
+import sys
+from textwrap import dedent
+from typing import FrozenSet, List, Optional, Set, Tuple, Type, Union
+import warnings
+
+import numpy as np
+import numpy.ma as ma
+
+from pandas._config import get_option
+
+from pandas._libs import algos as libalgos, lib
+from pandas.compat import PY36, raise_with_traceback
+from pandas.compat.numpy import function as nv
+from pandas.util._decorators import (
+ Appender,
+ Substitution,
+ deprecate_kwarg,
+ rewrite_axis_style_signature,
+)
+from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg
+
+from pandas.core.dtypes.cast import (
+ cast_scalar_to_array,
+ coerce_to_dtypes,
+ find_common_type,
+ infer_dtype_from_scalar,
+ invalidate_string_dtypes,
+ maybe_cast_to_datetime,
+ maybe_convert_platform,
+ maybe_downcast_to_dtype,
+ maybe_infer_to_datetimelike,
+ maybe_upcast,
+ maybe_upcast_putmask,
+)
+from pandas.core.dtypes.common import (
+ ensure_float64,
+ ensure_int64,
+ ensure_platform_int,
+ infer_dtype_from_object,
+ is_bool_dtype,
+ is_datetime64_any_dtype,
+ is_datetime64tz_dtype,
+ is_dict_like,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_float_dtype,
+ is_hashable,
+ is_integer,
+ is_integer_dtype,
+ is_iterator,
+ is_list_like,
+ is_named_tuple,
+ is_nested_list_like,
+ is_object_dtype,
+ is_scalar,
+ is_sequence,
+ needs_i8_conversion,
+)
+from pandas.core.dtypes.generic import (
+ ABCDataFrame,
+ ABCIndexClass,
+ ABCMultiIndex,
+ ABCSeries,
+)
+from pandas.core.dtypes.missing import isna, notna
+
+from pandas.core import algorithms, common as com, nanops, ops
+from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays import Categorical, ExtensionArray
+from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
+from pandas.core.arrays.sparse import SparseFrameAccessor
+from pandas.core.generic import NDFrame, _shared_docs
+from pandas.core.index import (
+ Index,
+ MultiIndex,
+ ensure_index,
+ ensure_index_from_sequences,
+)
+from pandas.core.indexes import base as ibase
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.period import PeriodIndex
+from pandas.core.indexing import (
+ check_bool_indexer,
+ convert_to_index_sliceable,
+ maybe_droplevels,
+)
+from pandas.core.internals import BlockManager
+from pandas.core.internals.construction import (
+ arrays_to_mgr,
+ get_names_from_index,
+ init_dict,
+ init_ndarray,
+ masked_rec_array_to_mgr,
+ reorder_arrays,
+ sanitize_index,
+ to_arrays,
+)
+from pandas.core.series import Series
+
+from pandas.io.formats import console, format as fmt
+from pandas.io.formats.printing import pprint_thing
+import pandas.plotting
+
+# ---------------------------------------------------------------------
+# Docstring templates
+
+_shared_doc_kwargs = dict(
+ axes="index, columns",
+ klass="DataFrame",
+ axes_single_arg="{0 or 'index', 1 or 'columns'}",
+ axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
+ If 0 or 'index': apply function to each column.
+ If 1 or 'columns': apply function to each row.""",
+ optional_by="""
+ by : str or list of str
+ Name or list of names to sort by.
+
+ - if `axis` is 0 or `'index'` then `by` may contain index
+ levels and/or column labels
+ - if `axis` is 1 or `'columns'` then `by` may contain column
+ levels and/or index labels
+
+ .. versionchanged:: 0.23.0
+ Allow specifying index or column level names.""",
+ versionadded_to_excel="",
+ optional_labels="""labels : array-like, optional
+ New labels / index to conform the axis specified by 'axis' to.""",
+ optional_axis="""axis : int or str, optional
+ Axis to target. Can be either the axis name ('index', 'columns')
+ or number (0, 1).""",
+)
+
+_numeric_only_doc = """numeric_only : boolean, default None
+ Include only float, int, boolean data. If None, will attempt to use
+ everything, then use only numeric data.
+"""
+
+_merge_doc = """
+Merge DataFrame or named Series objects with a database-style join.
+
+The join is done on columns or indexes. If joining columns on
+columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
+on indexes or indexes on a column or columns, the index will be passed on.
+
+Parameters
+----------%s
+right : DataFrame or named Series
+ Object to merge with.
+how : {'left', 'right', 'outer', 'inner'}, default 'inner'
+ Type of merge to be performed.
+
+ * left: use only keys from left frame, similar to a SQL left outer join;
+ preserve key order.
+ * right: use only keys from right frame, similar to a SQL right outer join;
+ preserve key order.
+ * outer: use union of keys from both frames, similar to a SQL full outer
+ join; sort keys lexicographically.
+ * inner: use intersection of keys from both frames, similar to a SQL inner
+ join; preserve the order of the left keys.
+on : label or list
+ Column or index level names to join on. These must be found in both
+ DataFrames. If `on` is None and not merging on indexes then this defaults
+ to the intersection of the columns in both DataFrames.
+left_on : label or list, or array-like
+ Column or index level names to join on in the left DataFrame. Can also
+ be an array or list of arrays of the length of the left DataFrame.
+ These arrays are treated as if they are columns.
+right_on : label or list, or array-like
+ Column or index level names to join on in the right DataFrame. Can also
+ be an array or list of arrays of the length of the right DataFrame.
+ These arrays are treated as if they are columns.
+left_index : bool, default False
+ Use the index from the left DataFrame as the join key(s). If it is a
+ MultiIndex, the number of keys in the other DataFrame (either the index
+ or a number of columns) must match the number of levels.
+right_index : bool, default False
+ Use the index from the right DataFrame as the join key. Same caveats as
+ left_index.
+sort : bool, default False
+ Sort the join keys lexicographically in the result DataFrame. If False,
+ the order of the join keys depends on the join type (how keyword).
+suffixes : tuple of (str, str), default ('_x', '_y')
+ Suffix to apply to overlapping column names in the left and right
+ side, respectively. To raise an exception on overlapping columns use
+ (False, False).
+copy : bool, default True
+ If False, avoid copy if possible.
+indicator : bool or str, default False
+ If True, adds a column to output DataFrame called "_merge" with
+ information on the source of each row.
+ If string, column with information on source of each row will be added to
+ output DataFrame, and column will be named value of string.
+ Information column is Categorical-type and takes on a value of "left_only"
+ for observations whose merge key only appears in 'left' DataFrame,
+ "right_only" for observations whose merge key only appears in 'right'
+ DataFrame, and "both" if the observation's merge key is found in both.
+
+validate : str, optional
+ If specified, checks if merge is of specified type.
+
+ * "one_to_one" or "1:1": check if merge keys are unique in both
+ left and right datasets.
+ * "one_to_many" or "1:m": check if merge keys are unique in left
+ dataset.
+ * "many_to_one" or "m:1": check if merge keys are unique in right
+ dataset.
+ * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+ .. versionadded:: 0.21.0
+
+Returns
+-------
+DataFrame
+ A DataFrame of the two merged objects.
+
+See Also
+--------
+merge_ordered : Merge with optional filling/interpolation.
+merge_asof : Merge on nearest keys.
+DataFrame.join : Similar method using indices.
+
+Notes
+-----
+Support for specifying index levels as the `on`, `left_on`, and
+`right_on` parameters was added in version 0.23.0.
+Support for merging named Series objects was added in version 0.24.0.
+
+Examples
+--------
+
+>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
+... 'value': [1, 2, 3, 5]})
+>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
+... 'value': [5, 6, 7, 8]})
+>>> df1
+ lkey value
+0 foo 1
+1 bar 2
+2 baz 3
+3 foo 5
+>>> df2
+ rkey value
+0 foo 5
+1 bar 6
+2 baz 7
+3 foo 8
+
+Merge df1 and df2 on the lkey and rkey columns. The value columns have
+the default suffixes, _x and _y, appended.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey')
+ lkey value_x rkey value_y
+0 foo 1 foo 5
+1 foo 1 foo 8
+2 foo 5 foo 5
+3 foo 5 foo 8
+4 bar 2 bar 6
+5 baz 3 baz 7
+
+Merge DataFrames df1 and df2 with specified left and right suffixes
+appended to any overlapping columns.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey',
+... suffixes=('_left', '_right'))
+ lkey value_left rkey value_right
+0 foo 1 foo 5
+1 foo 1 foo 8
+2 foo 5 foo 5
+3 foo 5 foo 8
+4 bar 2 bar 6
+5 baz 3 baz 7
+
+Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
+any overlapping columns.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
+Traceback (most recent call last):
+...
+ValueError: columns overlap but no suffix specified:
+ Index(['value'], dtype='object')
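+
+Using ``indicator`` adds a Categorical ``_merge`` column recording the
+source of each row. With ``df1`` and ``df2`` above, every key occurs in
+both frames, so each entry would be "both" (call shown, output omitted):
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey',
+...           indicator=True)  # doctest: +SKIP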
+"""
+
+
+# -----------------------------------------------------------------------
+# DataFrame class
+
+
+class DataFrame(NDFrame):
+ """
+ Two-dimensional size-mutable, potentially heterogeneous tabular data
+ structure with labeled axes (rows and columns). Arithmetic operations
+ align on both row and column labels. Can be thought of as a dict-like
+ container for Series objects. The primary pandas data structure.
+
+ Parameters
+ ----------
+ data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
+ Dict can contain Series, arrays, constants, or list-like objects
+
+ .. versionchanged:: 0.23.0
+ If data is a dict, column order follows insertion-order for
+ Python 3.6 and later.
+
+ .. versionchanged:: 0.25.0
+ If data is a list of dicts, column order follows insertion-order
+ for Python 3.6 and later.
+
+ index : Index or array-like
+ Index to use for resulting frame. Will default to RangeIndex if
+ no indexing information is part of the input data and no index is
+ provided.
+ columns : Index or array-like
+ Column labels to use for resulting frame. Will default to
+ RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
+ dtype : dtype, default None
+ Data type to force. Only a single dtype is allowed. If None, infer.
+ copy : boolean, default False
+ Copy data from inputs. Only affects DataFrame / 2d ndarray input.
+
+ See Also
+ --------
+ DataFrame.from_records : Constructor from tuples, also record arrays.
+ DataFrame.from_dict : From dicts of Series, arrays, or dicts.
+ DataFrame.from_items : From sequence of (key, value) pairs.
+ read_csv : Read a comma-separated values (csv) file into DataFrame.
+ read_table : Read general delimited file into DataFrame.
+ read_clipboard : Read text from clipboard into DataFrame.
+
+ Examples
+ --------
+ Constructing DataFrame from a dictionary.
+
+ >>> d = {'col1': [1, 2], 'col2': [3, 4]}
+ >>> df = pd.DataFrame(data=d)
+ >>> df
+ col1 col2
+ 0 1 3
+ 1 2 4
+
+ Notice that the inferred dtype is int64.
+
+ >>> df.dtypes
+ col1 int64
+ col2 int64
+ dtype: object
+
+ To enforce a single dtype:
+
+ >>> df = pd.DataFrame(data=d, dtype=np.int8)
+ >>> df.dtypes
+ col1 int8
+ col2 int8
+ dtype: object
+
+ Constructing DataFrame from numpy ndarray:
+
+ >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
+ ... columns=['a', 'b', 'c'])
+ >>> df2
+ a b c
+ 0 1 2 3
+ 1 4 5 6
+ 2 7 8 9
+ """
+
+ @property
+ def _constructor(self):
+ return DataFrame
+
+ _constructor_sliced = Series # type: Type[Series]
+ _deprecations = NDFrame._deprecations | frozenset(
+ ["get_value", "set_value", "from_items"]
+ ) # type: FrozenSet[str]
+ _accessors = set() # type: Set[str]
+
+ @property
+ def _constructor_expanddim(self):
+ raise NotImplementedError("Not supported for DataFrames!")
+
+ # ----------------------------------------------------------------------
+ # Constructors
+
+ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
+ if data is None:
+ data = {}
+ if dtype is not None:
+ dtype = self._validate_dtype(dtype)
+
+ if isinstance(data, DataFrame):
+ data = data._data
+
+ if isinstance(data, BlockManager):
+ mgr = self._init_mgr(
+ data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+ )
+ elif isinstance(data, dict):
+ mgr = init_dict(data, index, columns, dtype=dtype)
+ elif isinstance(data, ma.MaskedArray):
+ import numpy.ma.mrecords as mrecords
+
+ # masked recarray
+ if isinstance(data, mrecords.MaskedRecords):
+ mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
+
+ # a masked array
+ else:
+ mask = ma.getmaskarray(data)
+ if mask.any():
+ data, fill_value = maybe_upcast(data, copy=True)
+ data.soften_mask() # set hardmask False if it was True
+ data[mask] = fill_value
+ else:
+ data = data.copy()
+ mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+ elif isinstance(data, (np.ndarray, Series, Index)):
+ if data.dtype.names:
+ data_columns = list(data.dtype.names)
+ data = {k: data[k] for k in data_columns}
+ if columns is None:
+ columns = data_columns
+ mgr = init_dict(data, index, columns, dtype=dtype)
+ elif getattr(data, "name", None) is not None:
+ mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
+ else:
+ mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+ # data is list-like or an Iterable (will be consumed into a list)
+ elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
+ if not isinstance(data, abc.Sequence):
+ data = list(data)
+ if len(data) > 0:
+ if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
+ if is_named_tuple(data[0]) and columns is None:
+ columns = data[0]._fields
+ arrays, columns = to_arrays(data, columns, dtype=dtype)
+ columns = ensure_index(columns)
+
+ # set the index
+ if index is None:
+ if isinstance(data[0], Series):
+ index = get_names_from_index(data)
+ elif isinstance(data[0], Categorical):
+ index = ibase.default_index(len(data[0]))
+ else:
+ index = ibase.default_index(len(data))
+
+ mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+ else:
+ mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+ else:
+ mgr = init_dict({}, index, columns, dtype=dtype)
+ else:
+ try:
+ arr = np.array(data, dtype=dtype, copy=copy)
+ except (ValueError, TypeError) as e:
+ exc = TypeError(
+ "DataFrame constructor called with "
+ "incompatible data and dtype: {e}".format(e=e)
+ )
+ raise_with_traceback(exc)
+
+ if arr.ndim == 0 and index is not None and columns is not None:
+ values = cast_scalar_to_array(
+ (len(index), len(columns)), data, dtype=dtype
+ )
+ mgr = init_ndarray(
+ values, index, columns, dtype=values.dtype, copy=False
+ )
+ else:
+ raise ValueError("DataFrame constructor not properly called!")
+
+ NDFrame.__init__(self, mgr, fastpath=True)
+
+ # ----------------------------------------------------------------------
+
+ @property
+ def axes(self):
+ """
+ Return a list representing the axes of the DataFrame.
+
+ It has the row axis labels and column axis labels as the only members.
+ They are returned in that order.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.axes
+ [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
+ dtype='object')]
+ """
+ return [self.index, self.columns]
+
+ @property
+ def shape(self):
+ """
+ Return a tuple representing the dimensionality of the DataFrame.
+
+ See Also
+ --------
+ ndarray.shape
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.shape
+ (2, 2)
+
+ >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
+ ... 'col3': [5, 6]})
+ >>> df.shape
+ (2, 3)
+ """
+ return len(self.index), len(self.columns)
+
+ @property
+ def _is_homogeneous_type(self):
+ """
+ Whether all the columns in a DataFrame have the same type.
+
+ Returns
+ -------
+ bool
+
+ Examples
+ --------
+ >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
+ True
+ >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
+ False
+
+ Items with the same type but different sizes are considered
+ different types.
+
+ >>> DataFrame({
+ ... "A": np.array([1, 2], dtype=np.int32),
+ ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
+ False
+ """
+ if self._data.any_extension_types:
+ return len({block.dtype for block in self._data.blocks}) == 1
+ else:
+ return not self._data.is_mixed_type
+
+ # ----------------------------------------------------------------------
+ # Rendering Methods
+
+ def _repr_fits_vertical_(self):
+ """
+ Check length against max_rows.
+ """
+ max_rows = get_option("display.max_rows")
+ return len(self) <= max_rows
+
+ def _repr_fits_horizontal_(self, ignore_width=False):
+ """
+ Check if full repr fits in horizontal boundaries imposed by the display
+ options width and max_columns.
+
+ In case of a non-interactive session, no boundaries apply.
+
+ `ignore_width` is here so ipynb+HTML output can behave the way
+ users expect. display.max_columns remains in effect.
+ GH3541, GH3573
+ """
+
+ width, height = console.get_console_size()
+ max_columns = get_option("display.max_columns")
+ nb_columns = len(self.columns)
+
+ # exceed max columns
+ if (max_columns and nb_columns > max_columns) or (
+ (not ignore_width) and width and nb_columns > (width // 2)
+ ):
+ return False
+
+ # when used by repr_html under the IPython notebook, or in scripts,
+ # terminal dims are ignored
+ if ignore_width or not console.in_interactive_session():
+ return True
+
+ if get_option("display.width") is not None or console.in_ipython_frontend():
+ # check at least the column row for excessive width
+ max_rows = 1
+ else:
+ max_rows = get_option("display.max_rows")
+
+ # when auto-detecting, so width=None and not in ipython front end
+ # check whether repr fits horizontal by actually checking
+ # the width of the rendered repr
+ buf = StringIO()
+
+ # only care about the stuff we'll actually print out
+ # and to_string on entire frame may be expensive
+ d = self
+
+ if max_rows is not None:
+ # limit to the smaller of max_rows and the frame's length
+ d = d.iloc[: min(max_rows, len(d))]
+ else:
+ # max_rows of None means unlimited rows; repr always fits
+ return True
+
+ d.to_string(buf=buf)
+ value = buf.getvalue()
+ repr_width = max(len(l) for l in value.split("\n"))
+
+ return repr_width < width
+
+ def _info_repr(self):
+ """
+ True if the repr should show the info view.
+ """
+ info_repr_option = get_option("display.large_repr") == "info"
+ return info_repr_option and not (
+ self._repr_fits_horizontal_() and self._repr_fits_vertical_()
+ )
+
+ def __repr__(self):
+ """
+ Return a string representation for a particular DataFrame.
+ """
+ buf = StringIO("")
+ if self._info_repr():
+ self.info(buf=buf)
+ return buf.getvalue()
+
+ max_rows = get_option("display.max_rows")
+ min_rows = get_option("display.min_rows")
+ max_cols = get_option("display.max_columns")
+ show_dimensions = get_option("display.show_dimensions")
+ if get_option("display.expand_frame_repr"):
+ width, _ = console.get_console_size()
+ else:
+ width = None
+ self.to_string(
+ buf=buf,
+ max_rows=max_rows,
+ min_rows=min_rows,
+ max_cols=max_cols,
+ line_width=width,
+ show_dimensions=show_dimensions,
+ )
+
+ return buf.getvalue()
+
+ def _repr_html_(self):
+ """
+ Return a html representation for a particular DataFrame.
+
+ Mainly for IPython notebook.
+ """
+ if self._info_repr():
+ buf = StringIO("")
+ self.info(buf=buf)
+ # need to escape the <class>, should be the first line.
+ val = buf.getvalue().replace("<", r"&lt;", 1)
+ val = val.replace(">", r"&gt;", 1)
+ return "<pre>" + val + "</pre>"
+
+ if get_option("display.notebook_repr_html"):
+ max_rows = get_option("display.max_rows")
+ max_cols = get_option("display.max_columns")
+ show_dimensions = get_option("display.show_dimensions")
+
+ return self.to_html(
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ notebook=True,
+ )
+ else:
+ return None
+
+ @Substitution(
+ header="Write out the column names. If a list of strings "
+ "is given, it is assumed to be aliases for the "
+ "column names",
+ col_space_type="int",
+ col_space="The minimum width of each column",
+ )
+ @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+ def to_string(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ justify=None,
+ max_rows=None,
+ min_rows=None,
+ max_cols=None,
+ show_dimensions=False,
+ decimal=".",
+ line_width=None,
+ ):
+ """
+ Render a DataFrame to a console-friendly tabular output.
+ %(shared_params)s
+ line_width : int, optional
+ Width to wrap a line in characters.
+ %(returns)s
+ See Also
+ --------
+ to_html : Convert DataFrame to HTML.
+
+ Examples
+ --------
+ >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
+ >>> df = pd.DataFrame(d)
+ >>> print(df.to_string())
+ col1 col2
+ 0 1 4
+ 1 2 5
+ 2 3 6
+ """
+
+ formatter = fmt.DataFrameFormatter(
+ self,
+ buf=buf,
+ columns=columns,
+ col_space=col_space,
+ na_rep=na_rep,
+ formatters=formatters,
+ float_format=float_format,
+ sparsify=sparsify,
+ justify=justify,
+ index_names=index_names,
+ header=header,
+ index=index,
+ min_rows=min_rows,
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ decimal=decimal,
+ line_width=line_width,
+ )
+ formatter.to_string()
+
+ if buf is None:
+ result = formatter.buf.getvalue()
+ return result
+
+ # ----------------------------------------------------------------------
+
+ @property
+ def style(self):
+ """
+ Property returning a Styler object containing methods for
+ building a styled HTML representation of the DataFrame.
+
+ See Also
+ --------
+ io.formats.style.Styler
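+
+ Examples
+ --------
+ A minimal illustration; the Styler renders as HTML in notebooks, so
+ the doctest is skipped and the repr shown is schematic:
+
+ >>> df = pd.DataFrame({'A': [1, 2]})
+ >>> df.style # doctest: +SKIP
+ <pandas.io.formats.style.Styler object>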
+ """
+ from pandas.io.formats.style import Styler
+
+ return Styler(self)
+
+ _shared_docs[
+ "items"
+ ] = r"""
+ Iterator over (column name, Series) pairs.
+
+ Iterates over the DataFrame columns, returning a tuple with
+ the column name and the content as a Series.
+
+ %s
+ label : object
+ The column names for the DataFrame being iterated over.
+ content : Series
+ The column entries belonging to each label, as a Series.
+
+ See Also
+ --------
+ DataFrame.iterrows : Iterate over DataFrame rows as
+ (index, Series) pairs.
+ DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
+ of the values.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
+ ... 'population': [1864, 22000, 80000]},
+ ... index=['panda', 'polar', 'koala'])
+ >>> df
+ species population
+ panda bear 1864
+ polar bear 22000
+ koala marsupial 80000
+ >>> for label, content in df.items():
+ ... print('label:', label)
+ ... print('content:', content, sep='\n')
+ ...
+ label: species
+ content:
+ panda bear
+ polar bear
+ koala marsupial
+ Name: species, dtype: object
+ label: population
+ content:
+ panda 1864
+ polar 22000
+ koala 80000
+ Name: population, dtype: int64
+ """
+
+ @Appender(_shared_docs["items"] % "Yields\n ------")
+ def items(self):
+ if self.columns.is_unique and hasattr(self, "_item_cache"):
+ for k in self.columns:
+ yield k, self._get_item_cache(k)
+ else:
+ for i, k in enumerate(self.columns):
+ yield k, self._ixs(i, axis=1)
+
+ @Appender(_shared_docs["items"] % "Returns\n -------")
+ def iteritems(self):
+ return self.items()
+
+ def iterrows(self):
+ """
+ Iterate over DataFrame rows as (index, Series) pairs.
+
+ Yields
+ ------
+ index : label or tuple of label
+ The index of the row. A tuple for a `MultiIndex`.
+ data : Series
+ The data of the row as a Series.
+
+ See Also
+ --------
+ itertuples : Iterate over DataFrame rows as namedtuples of the values.
+ items : Iterate over (column name, Series) pairs.
+
+ Notes
+ -----
+
+ 1. Because ``iterrows`` returns a Series for each row,
+ it does **not** preserve dtypes across the rows (dtypes are
+ preserved across columns for DataFrames). For example,
+
+ >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
+ >>> row = next(df.iterrows())[1]
+ >>> row
+ int 1.0
+ float 1.5
+ Name: 0, dtype: float64
+ >>> print(row['int'].dtype)
+ float64
+ >>> print(df['int'].dtype)
+ int64
+
+ To preserve dtypes while iterating over the rows, it is better
+ to use :meth:`itertuples` which returns namedtuples of the values
+ and which is generally faster than ``iterrows``.
+
+ 2. You should **never modify** something you are iterating over.
+ This is not guaranteed to work in all cases. Depending on the
+ data types, the iterator returns a copy and not a view, and writing
+ to it will have no effect.
+ """
+ columns = self.columns
+ klass = self._constructor_sliced
+ for k, v in zip(self.index, self.values):
+ s = klass(v, index=columns, name=k)
+ yield k, s
+
+ def itertuples(self, index=True, name="Pandas"):
+ """
+ Iterate over DataFrame rows as namedtuples.
+
+ Parameters
+ ----------
+ index : bool, default True
+ If True, return the index as the first element of the tuple.
+ name : str or None, default "Pandas"
+ The name of the returned namedtuples or None to return regular
+ tuples.
+
+ Returns
+ -------
+ iterator
+ An object to iterate over namedtuples for each row in the
+ DataFrame with the first field possibly being the index and
+ following fields being the column values.
+
+ See Also
+ --------
+ DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
+ pairs.
+ DataFrame.items : Iterate over (column name, Series) pairs.
+
+ Notes
+ -----
+ The column names will be renamed to positional names if they are
+ invalid Python identifiers, repeated, or start with an underscore.
+ With a large number of columns (>255), regular tuples are returned.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
+ ... index=['dog', 'hawk'])
+ >>> df
+ num_legs num_wings
+ dog 4 0
+ hawk 2 2
+ >>> for row in df.itertuples():
+ ... print(row)
+ ...
+ Pandas(Index='dog', num_legs=4, num_wings=0)
+ Pandas(Index='hawk', num_legs=2, num_wings=2)
+
+ By setting the `index` parameter to False we can remove the index
+ as the first element of the tuple:
+
+ >>> for row in df.itertuples(index=False):
+ ... print(row)
+ ...
+ Pandas(num_legs=4, num_wings=0)
+ Pandas(num_legs=2, num_wings=2)
+
+ With the `name` parameter set we set a custom name for the yielded
+ namedtuples:
+
+ >>> for row in df.itertuples(name='Animal'):
+ ... print(row)
+ ...
+ Animal(Index='dog', num_legs=4, num_wings=0)
+ Animal(Index='hawk', num_legs=2, num_wings=2)
+ """
+ arrays = []
+ fields = list(self.columns)
+ if index:
+ arrays.append(self.index)
+ fields.insert(0, "Index")
+
+ # use integer indexing because of possible duplicate column names
+ arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
+
+ # Python 3 supports at most 255 arguments to constructor
+ if name is not None and len(self.columns) + index < 256:
+ itertuple = collections.namedtuple(name, fields, rename=True)
+ return map(itertuple._make, zip(*arrays))
+
+ # fallback to regular tuples
+ return zip(*arrays)
+
+ def __len__(self):
+ """
+ Returns length of info axis, but here we use the index.
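+
+ For example:
+
+ >>> len(pd.DataFrame({'A': [1, 2, 3]}))
+ 3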
+ """
+ return len(self.index)
+
+ def dot(self, other):
+ """
+ Compute the matrix multiplication between the DataFrame and other.
+
+ This method computes the matrix product between the DataFrame and the
+ values of another Series, DataFrame or a numpy array.
+
+ It can also be called using ``self @ other`` in Python >= 3.5.
+
+ Parameters
+ ----------
+ other : Series, DataFrame or array-like
+ The other object to compute the matrix product with.
+
+ Returns
+ -------
+ Series or DataFrame
+ If other is a Series, return the matrix product between self and
+ other as a Series. If other is a DataFrame or a numpy.array, return
+ the matrix product of self and other as a DataFrame or a np.array.
+
+ See Also
+ --------
+ Series.dot: Similar method for Series.
+
+ Notes
+ -----
+ The dimensions of DataFrame and other must be compatible in order to
+ compute the matrix multiplication. In addition, the column names of
+ DataFrame and the index of other must contain the same values, as they
+ will be aligned prior to the multiplication.
+
+ The dot method for Series computes the inner product, instead of the
+ matrix product here.
+
+ Examples
+ --------
+ Here we multiply a DataFrame with a Series.
+
+ >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+ >>> s = pd.Series([1, 1, 2, 1])
+ >>> df.dot(s)
+ 0 -4
+ 1 5
+ dtype: int64
+
+ Here we multiply a DataFrame with another DataFrame.
+
+ >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+ >>> df.dot(other)
+ 0 1
+ 0 1 4
+ 1 2 2
+
+ Note that the dot method gives the same result as @
+
+ >>> df @ other
+ 0 1
+ 0 1 4
+ 1 2 2
+
+ The dot method also works if other is a np.array.
+
+ >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+ >>> df.dot(arr)
+ 0 1
+ 0 1 4
+ 1 2 2
+
+ Note how shuffling of the objects does not change the result.
+
+ >>> s2 = s.reindex([1, 0, 2, 3])
+ >>> df.dot(s2)
+ 0 -4
+ 1 5
+ dtype: int64
+ """
+ if isinstance(other, (Series, DataFrame)):
+ common = self.columns.union(other.index)
+ if len(common) > len(self.columns) or len(common) > len(other.index):
+ raise ValueError("matrices are not aligned")
+
+ left = self.reindex(columns=common, copy=False)
+ right = other.reindex(index=common, copy=False)
+ lvals = left.values
+ rvals = right.values
+ else:
+ left = self
+ lvals = self.values
+ rvals = np.asarray(other)
+ if lvals.shape[1] != rvals.shape[0]:
+ raise ValueError(
+ "Dot product shape mismatch, "
+ "{s} vs {r}".format(s=lvals.shape, r=rvals.shape)
+ )
+
+ if isinstance(other, DataFrame):
+ return self._constructor(
+ np.dot(lvals, rvals), index=left.index, columns=other.columns
+ )
+ elif isinstance(other, Series):
+ return Series(np.dot(lvals, rvals), index=left.index)
+ elif isinstance(rvals, (np.ndarray, Index)):
+ result = np.dot(lvals, rvals)
+ if result.ndim == 2:
+ return self._constructor(result, index=left.index)
+ else:
+ return Series(result, index=left.index)
+ else: # pragma: no cover
+ raise TypeError("unsupported type: {oth}".format(oth=type(other)))
+
+ def __matmul__(self, other):
+ """
+ Matrix multiplication using binary `@` operator in Python>=3.5.
+ """
+ return self.dot(other)
+
+ def __rmatmul__(self, other):
+ """
+ Matrix multiplication using binary `@` operator in Python>=3.5.
+ """
+ return self.T.dot(np.transpose(other)).T
+
+ # ----------------------------------------------------------------------
+ # IO methods (to / from other formats)
+
+ @classmethod
+ def from_dict(cls, data, orient="columns", dtype=None, columns=None):
+ """
+ Construct DataFrame from dict of array-like or dicts.
+
+ Creates DataFrame object from dictionary by columns or by index
+ allowing dtype specification.
+
+ Parameters
+ ----------
+ data : dict
+ Of the form {field : array-like} or {field : dict}.
+ orient : {'columns', 'index'}, default 'columns'
+ The "orientation" of the data. If the keys of the passed dict
+ should be the columns of the resulting DataFrame, pass 'columns'
+ (default). Otherwise if the keys should be rows, pass 'index'.
+ dtype : dtype, default None
+ Data type to force, otherwise infer.
+ columns : list, default None
+ Column labels to use when ``orient='index'``. Raises a ValueError
+ if used with ``orient='columns'``.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ DataFrame.from_records : DataFrame from ndarray (structured
+ dtype), list of tuples, dict, or DataFrame.
+ DataFrame : DataFrame object creation using constructor.
+
+ Examples
+ --------
+ By default the keys of the dict become the DataFrame columns:
+
+ >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
+ >>> pd.DataFrame.from_dict(data)
+ col_1 col_2
+ 0 3 a
+ 1 2 b
+ 2 1 c
+ 3 0 d
+
+ Specify ``orient='index'`` to create the DataFrame using dictionary
+ keys as rows:
+
+ >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
+ >>> pd.DataFrame.from_dict(data, orient='index')
+ 0 1 2 3
+ row_1 3 2 1 0
+ row_2 a b c d
+
+ When using the 'index' orientation, the column names can be
+ specified manually:
+
+ >>> pd.DataFrame.from_dict(data, orient='index',
+ ... columns=['A', 'B', 'C', 'D'])
+ A B C D
+ row_1 3 2 1 0
+ row_2 a b c d
+ """
+ index = None
+ orient = orient.lower()
+ if orient == "index":
+ if len(data) > 0:
+ # TODO speed up Series case
+ if isinstance(list(data.values())[0], (Series, dict)):
+ data = _from_nested_dict(data)
+ else:
+ data, index = list(data.values()), list(data.keys())
+ elif orient == "columns":
+ if columns is not None:
+ raise ValueError(
+ "cannot use columns parameter with " "orient='columns'"
+ )
+ else: # pragma: no cover
+ raise ValueError("only recognize index or columns for orient")
+
+ return cls(data, index=index, columns=columns, dtype=dtype)
+
+ def to_numpy(self, dtype=None, copy=False):
+ """
+ Convert the DataFrame to a NumPy array.
+
+ .. versionadded:: 0.24.0
+
+ By default, the dtype of the returned array will be the common NumPy
+ dtype of all types in the DataFrame. For example, if the dtypes are
+ ``float16`` and ``float32``, the resulting dtype will be ``float32``.
+ This may require copying data and coercing values, which may be
+ expensive.
+
+ Parameters
+ ----------
+ dtype : str or numpy.dtype, optional
+ The dtype to pass to :meth:`numpy.asarray`.
+ copy : bool, default False
+ Whether to ensure that the returned value is not a view on
+ another array. Note that ``copy=False`` does not *ensure* that
+ ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+ a copy is made, even if not strictly necessary.
+
+ Returns
+ -------
+ numpy.ndarray
+
+ See Also
+ --------
+ Series.to_numpy : Similar method for Series.
+
+ Examples
+ --------
+ >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
+ array([[1, 3],
+ [2, 4]])
+
+ With heterogeneous data, the lowest common type will have to
+ be used.
+
+ >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
+ >>> df.to_numpy()
+ array([[1. , 3. ],
+ [2. , 4.5]])
+
+ For a mix of numeric and non-numeric types, the output array will
+ have object dtype.
+
+ >>> df['C'] = pd.date_range('2000', periods=2)
+ >>> df.to_numpy()
+ array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
+ [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
+ """
+ result = np.array(self.values, dtype=dtype, copy=copy)
+ return result
+
+ def to_dict(self, orient="dict", into=dict):
+ """
+ Convert the DataFrame to a dictionary.
+
+ The type of the key-value pairs can be customized with the parameters
+ (see below).
+
+ Parameters
+ ----------
+ orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
+ Determines the type of the values of the dictionary.
+
+ - 'dict' (default) : dict like {column -> {index -> value}}
+ - 'list' : dict like {column -> [values]}
+ - 'series' : dict like {column -> Series(values)}
+ - 'split' : dict like
+ {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
+ - 'records' : list like
+ [{column -> value}, ... , {column -> value}]
+ - 'index' : dict like {index -> {column -> value}}
+
+ Abbreviations are allowed. `s` indicates `series` and `sp`
+ indicates `split`.
+
+ into : class, default dict
+ The collections.abc.Mapping subclass used for all Mappings
+ in the return value. Can be the actual class or an empty
+ instance of the mapping type you want. If you want a
+ collections.defaultdict, you must pass it initialized.
+
+ .. versionadded:: 0.21.0
+
+ Returns
+ -------
+ dict, list or collections.abc.Mapping
+ Return a collections.abc.Mapping object representing the DataFrame.
+ The resulting transformation depends on the `orient` parameter.
+
+ See Also
+ --------
+ DataFrame.from_dict: Create a DataFrame from a dictionary.
+ DataFrame.to_json: Convert a DataFrame to JSON format.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'col1': [1, 2],
+ ... 'col2': [0.5, 0.75]},
+ ... index=['row1', 'row2'])
+ >>> df
+ col1 col2
+ row1 1 0.50
+ row2 2 0.75
+ >>> df.to_dict()
+ {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
+ You can specify the return orientation.
+
+ >>> df.to_dict('series')
+ {'col1': row1 1
+ row2 2
+ Name: col1, dtype: int64,
+ 'col2': row1 0.50
+ row2 0.75
+ Name: col2, dtype: float64}
+
+ >>> df.to_dict('split')
+ {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+ 'data': [[1, 0.5], [2, 0.75]]}
+
+ >>> df.to_dict('records')
+ [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
+ >>> df.to_dict('index')
+ {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
+ You can also specify the mapping type.
+
+ >>> from collections import OrderedDict, defaultdict
+ >>> df.to_dict(into=OrderedDict)
+ OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+ ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
+ If you want a `defaultdict`, you need to initialize it:
+
+ >>> dd = defaultdict(list)
+ >>> df.to_dict('records', into=dd)
+ [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+ defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+ """
+ if not self.columns.is_unique:
+ warnings.warn(
+ "DataFrame columns are not unique, some " "columns will be omitted.",
+ UserWarning,
+ stacklevel=2,
+ )
+ # GH16122
+ into_c = com.standardize_mapping(into)
+ if orient.lower().startswith("d"):
+ return into_c((k, v.to_dict(into)) for k, v in self.items())
+ elif orient.lower().startswith("l"):
+ return into_c((k, v.tolist()) for k, v in self.items())
+ elif orient.lower().startswith("sp"):
+ return into_c(
+ (
+ ("index", self.index.tolist()),
+ ("columns", self.columns.tolist()),
+ (
+ "data",
+ [
+ list(map(com.maybe_box_datetimelike, t))
+ for t in self.itertuples(index=False, name=None)
+ ],
+ ),
+ )
+ )
+ elif orient.lower().startswith("s"):
+ return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items())
+ elif orient.lower().startswith("r"):
+ columns = self.columns.tolist()
+ rows = (
+ dict(zip(columns, row))
+ for row in self.itertuples(index=False, name=None)
+ )
+ return [
+ into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items())
+ for row in rows
+ ]
+ elif orient.lower().startswith("i"):
+ if not self.index.is_unique:
+ raise ValueError("DataFrame index must be unique for orient='index'.")
+ return into_c(
+ (t[0], dict(zip(self.columns, t[1:])))
+ for t in self.itertuples(name=None)
+ )
+ else:
+ raise ValueError("orient '{o}' not understood".format(o=orient))
+
+ def to_gbq(
+ self,
+ destination_table,
+ project_id=None,
+ chunksize=None,
+ reauth=False,
+ if_exists="fail",
+ auth_local_webserver=False,
+ table_schema=None,
+ location=None,
+ progress_bar=True,
+ credentials=None,
+ verbose=None,
+ private_key=None,
+ ):
+ """
+ Write a DataFrame to a Google BigQuery table.
+
+ This function requires the `pandas-gbq package
+ <https://pandas-gbq.readthedocs.io>`__.
+
+ See the `How to authenticate with Google BigQuery
+ <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
+ guide for authentication instructions.
+
+ Parameters
+ ----------
+ destination_table : str
+ Name of table to be written, in the form ``dataset.tablename``.
+ project_id : str, optional
+ Google BigQuery Account project ID. Optional when available from
+ the environment.
+ chunksize : int, optional
+ Number of rows to be inserted in each chunk from the dataframe.
+ Set to ``None`` to load the whole dataframe at once.
+ reauth : bool, default False
+ Force Google BigQuery to re-authenticate the user. This is useful
+ if multiple accounts are used.
+ if_exists : str, default 'fail'
+ Behavior when the destination table exists. Value can be one of:
+
+ ``'fail'``
+ If table exists, do nothing.
+ ``'replace'``
+ If table exists, drop it, recreate it, and insert data.
+ ``'append'``
+ If table exists, insert data. Create table if it does not exist.
+ auth_local_webserver : bool, default False
+ Use the `local webserver flow`_ instead of the `console flow`_
+ when getting user credentials.
+
+ .. _local webserver flow:
+ http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
+ .. _console flow:
+ http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
+
+ *New in version 0.2.0 of pandas-gbq*.
+ table_schema : list of dicts, optional
+ List of BigQuery table fields to which the DataFrame
+ columns conform, e.g. ``[{'name': 'col1', 'type':
+ 'STRING'},...]``. If a schema is not provided, it will be
+ generated according to the dtypes of the DataFrame columns. See
+ BigQuery API documentation on available names of a field.
+
+ *New in version 0.3.1 of pandas-gbq*.
+ location : str, optional
+ Location where the load job should run. See the `BigQuery locations
+ documentation
+ <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
+ list of available locations. The location must match that of the
+ target dataset.
+
+ *New in version 0.5.0 of pandas-gbq*.
+ progress_bar : bool, default True
+ Use the library `tqdm` to show the progress bar for the upload,
+ chunk by chunk.
+
+ *New in version 0.5.0 of pandas-gbq*.
+ credentials : google.auth.credentials.Credentials, optional
+ Credentials for accessing Google APIs. Use this parameter to
+ override default credentials, such as to use Compute Engine
+ :class:`google.auth.compute_engine.Credentials` or Service
+ Account :class:`google.oauth2.service_account.Credentials`
+ directly.
+
+ *New in version 0.8.0 of pandas-gbq*.
+
+ .. versionadded:: 0.24.0
+ verbose : bool, deprecated
+ Deprecated in pandas-gbq version 0.4.0. Use the `logging module
+ to adjust verbosity instead
+ <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
+ private_key : str, deprecated
+ Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
+ parameter and
+ :func:`google.oauth2.service_account.Credentials.from_service_account_info`
+ or
+ :func:`google.oauth2.service_account.Credentials.from_service_account_file`
+ instead.
+
+ Service account private key in JSON format. Can be file path
+ or string contents. This is useful for remote server
+ authentication (eg. Jupyter/IPython notebook on remote host).
+
+ See Also
+ --------
+ pandas_gbq.to_gbq : This function in the pandas-gbq library.
+ read_gbq : Read a DataFrame from Google BigQuery.
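+
+ Examples
+ --------
+ A minimal sketch; ``"my_dataset.my_table"`` and ``"my-project"`` are
+ placeholder names, and valid Google credentials are assumed:
+
+ >>> df = pd.DataFrame({'num': [1, 2]})
+ >>> df.to_gbq("my_dataset.my_table",
+ ... project_id="my-project") # doctest: +SKIP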
+ """
+ from pandas.io import gbq
+
+ gbq.to_gbq(
+ self,
+ destination_table,
+ project_id=project_id,
+ chunksize=chunksize,
+ reauth=reauth,
+ if_exists=if_exists,
+ auth_local_webserver=auth_local_webserver,
+ table_schema=table_schema,
+ location=location,
+ progress_bar=progress_bar,
+ credentials=credentials,
+ verbose=verbose,
+ private_key=private_key,
+ )
+
+ @classmethod
+ def from_records(
+ cls,
+ data,
+ index=None,
+ exclude=None,
+ columns=None,
+ coerce_float=False,
+ nrows=None,
+ ):
+ """
+ Convert structured or record ndarray to DataFrame.
+
+ Parameters
+ ----------
+ data : ndarray (structured dtype), list of tuples, dict, or DataFrame
+ index : str, list of fields, array-like
+ Field of array to use as the index, alternately a specific set of
+ input labels to use.
+ exclude : sequence, default None
+ Columns or fields to exclude.
+ columns : sequence, default None
+ Column names to use. If the passed data do not have names
+ associated with them, this argument provides names for the
+ columns. Otherwise this argument indicates the order of the columns
+ in the result (any names not found in the data will become all-NA
+ columns).
+ coerce_float : bool, default False
+ Attempt to convert values of non-string, non-numeric objects (like
+ decimal.Decimal) to floating point, useful for SQL result sets.
+ nrows : int, default None
+ Number of rows to read if data is an iterator.
+
+ Returns
+ -------
+ DataFrame
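+
+ Examples
+ --------
+ From a list of tuples with explicit column names:
+
+ >>> pd.DataFrame.from_records([(1, 2), (3, 4)], columns=['a', 'b'])
+ a b
+ 0 1 2
+ 1 3 4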
+ """
+
+ # Make a copy of the input columns so we can modify it
+ if columns is not None:
+ columns = ensure_index(columns)
+
+ if is_iterator(data):
+ if nrows == 0:
+ return cls()
+
+ try:
+ first_row = next(data)
+ except StopIteration:
+ return cls(index=index, columns=columns)
+
+ dtype = None
+ if hasattr(first_row, "dtype") and first_row.dtype.names:
+ dtype = first_row.dtype
+
+ values = [first_row]
+
+ if nrows is None:
+ values += data
+ else:
+ values.extend(itertools.islice(data, nrows - 1))
+
+ if dtype is not None:
+ data = np.array(values, dtype=dtype)
+ else:
+ data = values
+
+ if isinstance(data, dict):
+ if columns is None:
+ columns = arr_columns = ensure_index(sorted(data))
+ arrays = [data[k] for k in columns]
+ else:
+ arrays = []
+ arr_columns = []
+ for k, v in data.items():
+ if k in columns:
+ arr_columns.append(k)
+ arrays.append(v)
+
+ arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns)
+
+ elif isinstance(data, (np.ndarray, DataFrame)):
+ arrays, columns = to_arrays(data, columns)
+ if columns is not None:
+ columns = ensure_index(columns)
+ arr_columns = columns
+ else:
+ arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float)
+
+ arr_columns = ensure_index(arr_columns)
+ if columns is not None:
+ columns = ensure_index(columns)
+ else:
+ columns = arr_columns
+
+ if exclude is None:
+ exclude = set()
+ else:
+ exclude = set(exclude)
+
+ result_index = None
+ if index is not None:
+ if isinstance(index, str) or not hasattr(index, "__iter__"):
+ i = columns.get_loc(index)
+ exclude.add(index)
+ if len(arrays) > 0:
+ result_index = Index(arrays[i], name=index)
+ else:
+ result_index = Index([], name=index)
+ else:
+ try:
+ index_data = [arrays[arr_columns.get_loc(field)] for field in index]
+ result_index = ensure_index_from_sequences(index_data, names=index)
+
+ exclude.update(index)
+ except Exception:
+ result_index = index
+
+ if len(exclude): # non-empty check; any() would test label truthiness
+ arr_exclude = [x for x in exclude if x in arr_columns]
+ to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
+ arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
+
+ arr_columns = arr_columns.drop(arr_exclude)
+ columns = columns.drop(exclude)
+
+ mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
+
+ return cls(mgr)
+
+ def to_records(
+ self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None
+ ):
+ """
+ Convert DataFrame to a NumPy record array.
+
+ Index will be included as the first field of the record array if
+ requested.
+
+ Parameters
+ ----------
+ index : bool, default True
+ Include index in resulting record array, stored in 'index'
+ field or using the index label, if set.
+ convert_datetime64 : bool, default None
+ .. deprecated:: 0.23.0
+
+ Whether to convert the index to datetime.datetime if it is a
+ DatetimeIndex.
+ column_dtypes : str, type, dict, default None
+ .. versionadded:: 0.24.0
+
+ If a string or type, the data type to store all columns. If
+ a dictionary, a mapping of column names and indices (zero-indexed)
+ to specific data types.
+ index_dtypes : str, type, dict, default None
+ .. versionadded:: 0.24.0
+
+ If a string or type, the data type to store all index levels. If
+ a dictionary, a mapping of index level names and indices
+ (zero-indexed) to specific data types.
+
+ This mapping is applied only if `index=True`.
+
+ Returns
+ -------
+ numpy.recarray
+ NumPy ndarray with the DataFrame labels as fields and each row
+ of the DataFrame as entries.
+
+ See Also
+ --------
+ DataFrame.from_records: Convert structured or record ndarray
+ to DataFrame.
+ numpy.recarray: An ndarray that allows field access using
+ attributes, analogous to typed columns in a
+ spreadsheet.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
+ ... index=['a', 'b'])
+ >>> df
+ A B
+ a 1 0.50
+ b 2 0.75
+ >>> df.to_records()
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+ If the DataFrame index has no label then the recarray field name
+ is set to 'index'. If the index has a label then this is used as the
+ field name:
+
+ >>> df.index = df.index.rename("I")
+ >>> df.to_records()
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+ The index can be excluded from the record array:
+
+ >>> df.to_records(index=False)
+ rec.array([(1, 0.5 ), (2, 0.75)],
+ dtype=[('A', '<i8'), ('B', '<f8')])
+
+ Data types can be specified for the columns:
+
+ >>> df.to_records(column_dtypes={"A": "int32"})
+ rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+ dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
+ As well as for the index:
+
+ >>> df.to_records(index_dtypes="<S2")
+ rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
+ >>> index_dtypes = "<S{}".format(df.index.str.len().max())
+ >>> df.to_records(index_dtypes=index_dtypes)
+ rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+ dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
+ """
+
+ if convert_datetime64 is not None:
+ warnings.warn(
+ "The 'convert_datetime64' parameter is "
+ "deprecated and will be removed in a future "
+ "version",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+ if index:
+ if is_datetime64_any_dtype(self.index) and convert_datetime64:
+ ix_vals = [self.index.to_pydatetime()]
+ else:
+ if isinstance(self.index, MultiIndex):
+ # array of tuples to numpy cols (this copies the data repeatedly)
+ ix_vals = list(map(np.array, zip(*self.index.values)))
+ else:
+ ix_vals = [self.index.values]
+
+ arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns]
+
+ count = 0
+ index_names = list(self.index.names)
+
+ if isinstance(self.index, MultiIndex):
+ for i, n in enumerate(index_names):
+ if n is None:
+ index_names[i] = "level_%d" % count
+ count += 1
+ elif index_names[0] is None:
+ index_names = ["index"]
+
+ names = [str(name) for name in itertools.chain(index_names, self.columns)]
+ else:
+ arrays = [self[c]._internal_get_values() for c in self.columns]
+ names = [str(c) for c in self.columns]
+ index_names = []
+
+ index_len = len(index_names)
+ formats = []
+
+ for i, v in enumerate(arrays):
+ index = i
+
+ # When the names and arrays are collected, we
+ # first collect those in the DataFrame's index,
+ # followed by those in its columns.
+ #
+ # Thus, the total length of the array is:
+ # len(index_names) + len(DataFrame.columns).
+ #
+ # This check allows us to see whether we are
+ # handling a name / array in the index or column.
+ if index < index_len:
+ dtype_mapping = index_dtypes
+ name = index_names[index]
+ else:
+ index -= index_len
+ dtype_mapping = column_dtypes
+ name = self.columns[index]
+
+ # We have a dictionary, so we get the data type
+ # associated with the index or column (which can
+ # be denoted by its name in the DataFrame or its
+ # position in DataFrame's array of indices or
+ # columns, whichever is applicable.
+ if is_dict_like(dtype_mapping):
+ if name in dtype_mapping:
+ dtype_mapping = dtype_mapping[name]
+ elif index in dtype_mapping:
+ dtype_mapping = dtype_mapping[index]
+ else:
+ dtype_mapping = None
+
+ # If no mapping can be found, use the array's
+ # dtype attribute for formatting.
+ #
+ # A valid dtype must either be a type or
+ # string naming a type.
+ if dtype_mapping is None:
+ formats.append(v.dtype)
+ elif isinstance(dtype_mapping, (type, np.dtype, str)):
+ formats.append(dtype_mapping)
+ else:
+ element = "row" if i < index_len else "column"
+ msg = (
+ "Invalid dtype {dtype} specified for " "{element} {name}"
+ ).format(dtype=dtype_mapping, element=element, name=name)
+ raise ValueError(msg)
+
+ return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
+
+ @classmethod
+ def from_items(cls, items, columns=None, orient="columns"):
+ """
+ Construct a DataFrame from a list of tuples.
+
+ .. deprecated:: 0.23.0
+ `from_items` is deprecated and will be removed in a future version.
+ Use :meth:`DataFrame.from_dict(dict(items)) <DataFrame.from_dict>`
+ instead.
+ :meth:`DataFrame.from_dict(OrderedDict(items)) <DataFrame.from_dict>`
+ may be used to preserve the key order.
+
+ Convert (key, value) pairs to DataFrame. The keys will be the axis
+ index (usually the columns, but depends on the specified
+ orientation). The values should be arrays or Series.
+
+ Parameters
+ ----------
+ items : sequence of (key, value) pairs
+ Values should be arrays or Series.
+ columns : sequence of column labels, optional
+ Must be passed if orient='index'.
+ orient : {'columns', 'index'}, default 'columns'
+ The "orientation" of the data. If the keys of the
+ input correspond to column labels, pass 'columns'
+ (default). Otherwise if the keys correspond to the index,
+ pass 'index'.
+
+ Returns
+ -------
+ DataFrame
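+
+ Examples
+ --------
+ A small illustration, alongside the recommended replacement:
+
+ >>> items = [('A', [1, 2]), ('B', [3, 4])]
+ >>> pd.DataFrame.from_items(items) # doctest: +SKIP
+ A B
+ 0 1 3
+ 1 2 4
+
+ >>> from collections import OrderedDict
+ >>> pd.DataFrame.from_dict(OrderedDict(items))
+ A B
+ 0 1 3
+ 1 2 4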
+ """
+
+ warnings.warn(
+ "from_items is deprecated. Please use "
+ "DataFrame.from_dict(dict(items), ...) instead. "
+ "DataFrame.from_dict(OrderedDict(items)) may be used to "
+ "preserve the key order.",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+ keys, values = zip(*items)
+
+ if orient == "columns":
+ if columns is not None:
+ columns = ensure_index(columns)
+
+ idict = dict(items)
+ if len(idict) < len(items):
+ if not columns.equals(ensure_index(keys)):
+ raise ValueError(
+ "With non-unique item names, passed "
+ "columns must be identical"
+ )
+ arrays = values
+ else:
+ arrays = [idict[k] for k in columns if k in idict]
+ else:
+ columns = ensure_index(keys)
+ arrays = values
+
+ # GH 17312
+ # Provide more informative error msg when scalar values passed
+ try:
+ return cls._from_arrays(arrays, columns, None)
+
+ except ValueError:
+ if not is_nested_list_like(values):
+ raise ValueError(
+ "The value in each (key, value) pair "
+ "must be an array, Series, or dict"
+ )
+
+ elif orient == "index":
+ if columns is None:
+ raise TypeError("Must pass columns with orient='index'")
+
+ keys = ensure_index(keys)
+
+ # GH 17312
+ # Provide more informative error msg when scalar values passed
+ try:
+ arr = np.array(values, dtype=object).T
+ data = [lib.maybe_convert_objects(v) for v in arr]
+ return cls._from_arrays(data, columns, keys)
+
+ except TypeError:
+ if not is_nested_list_like(values):
+ raise ValueError(
+ "The value in each (key, value) pair "
+ "must be an array, Series, or dict"
+ )
+
+ else: # pragma: no cover
+ raise ValueError("'orient' must be either 'columns' or 'index'")
+
+ @classmethod
+ def _from_arrays(cls, arrays, columns, index, dtype=None):
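+ # `columns` is passed to arrays_to_mgr twice: once as the names of
+ # `arrays` and once as the resulting frame's column labels, which
+ # coincide here.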
+ mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+ return cls(mgr)
+
+ def to_sparse(self, fill_value=None, kind="block"):
+ """
+ Convert to SparseDataFrame.
+
+ .. deprecated:: 0.25.0
+
+ Implement the sparse version of the DataFrame, meaning that any data
+ matching a specific value is omitted in the representation.
+ The sparse DataFrame allows for more efficient storage.
+
+ Parameters
+ ----------
+ fill_value : float, default None
+ The specific value that should be omitted in the representation.
+ kind : {'block', 'integer'}, default 'block'
+ The kind of the SparseIndex tracking where data is not equal to
+ the fill value:
+
+ - 'block' tracks only the locations and sizes of blocks of data.
+ - 'integer' keeps an array with all the locations of the data.
+
+ In most cases 'block' is recommended, since it's more memory
+ efficient.
+
+ Returns
+ -------
+ SparseDataFrame
+ The sparse representation of the DataFrame.
+
+ See Also
+ --------
+ DataFrame.to_dense :
+ Converts the DataFrame back to its dense form.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([(np.nan, np.nan),
+ ... (1., np.nan),
+ ... (np.nan, 1.)])
+ >>> df
+ 0 1
+ 0 NaN NaN
+ 1 1.0 NaN
+ 2 NaN 1.0
+ >>> type(df)
+ <class 'pandas.core.frame.DataFrame'>
+
+ >>> sdf = df.to_sparse() # doctest: +SKIP
+ >>> sdf # doctest: +SKIP
+ 0 1
+ 0 NaN NaN
+ 1 1.0 NaN
+ 2 NaN 1.0
+ >>> type(sdf) # doctest: +SKIP
+ <class 'pandas.core.sparse.frame.SparseDataFrame'>
+ """
+ warnings.warn(
+ "DataFrame.to_sparse is deprecated and will be removed "
+ "in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+ from pandas.core.sparse.api import SparseDataFrame
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", message="SparseDataFrame")
+ return SparseDataFrame(
+ self._series,
+ index=self.index,
+ columns=self.columns,
+ default_kind=kind,
+ default_fill_value=fill_value,
+ )
+
+ @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None)
+ def to_stata(
+ self,
+ fname,
+ convert_dates=None,
+ write_index=True,
+ encoding="latin-1",
+ byteorder=None,
+ time_stamp=None,
+ data_label=None,
+ variable_labels=None,
+ version=114,
+ convert_strl=None,
+ ):
+ """
+ Export DataFrame object to Stata dta format.
+
+ Writes the DataFrame to a Stata dataset file.
+ "dta" files contain a Stata dataset.
+
+ Parameters
+ ----------
+ fname : str, buffer or path object
+ String, path object (pathlib.Path or py._path.local.LocalPath) or
+ object implementing a binary write() function. If using a buffer
+ then the buffer will not be automatically closed after the file
+ data has been written.
+ convert_dates : dict
+ Dictionary mapping columns containing datetime types to stata
+ internal format to use when writing the dates. Options are 'tc',
+ 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
+ or a name. Datetime columns that do not have a conversion type
+ specified will be converted to 'tc'. Raises NotImplementedError if
+ a datetime column has timezone information.
+ write_index : bool
+ Write the index to the Stata dataset.
+ encoding : str
+ Default is latin-1. Unicode is not supported.
+ byteorder : str
+ Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
+ time_stamp : datetime
+ A datetime to use as file creation date. Default is the current
+ time.
+ data_label : str, optional
+ A label for the data set. Must be 80 characters or smaller.
+ variable_labels : dict
+ Dictionary containing columns as keys and variable labels as
+ values. Each label must be 80 characters or smaller.
+
+ .. versionadded:: 0.19.0
+
+ version : {114, 117}, default 114
+ Version to use in the output dta file. Version 114 can be read
+ by Stata 10 and later. Version 117 can be read by Stata 13
+ or later. Version 114 limits string variables to 244 characters or
+ fewer, while 117 allows strings with lengths up to 2,000,000
+ characters.
+
+ .. versionadded:: 0.23.0
+
+ convert_strl : list, optional
+ List of column names to convert to string columns to Stata StrL
+ format. Only available if version is 117. Storing strings in the
+ StrL format can produce smaller dta files if strings have more than
+ 8 characters and values are repeated.
+
+ .. versionadded:: 0.23.0
+
+ Raises
+ ------
+ NotImplementedError
+ * If datetimes contain timezone information
+ * Column dtype is not representable in Stata
+ ValueError
+ * Columns listed in convert_dates are neither datetime64[ns]
+ nor datetime.datetime
+ * Column listed in convert_dates is not in DataFrame
+ * Categorical label contains more than 32,000 characters
+
+ .. versionadded:: 0.19.0
+
+ See Also
+ --------
+ read_stata : Import Stata data files.
+ io.stata.StataWriter : Low-level writer for Stata data files.
+ io.stata.StataWriter117 : Low-level writer for version 117 files.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
+ ... 'parrot'],
+ ... 'speed': [350, 18, 361, 15]})
+ >>> df.to_stata('animals.dta') # doctest: +SKIP
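+
+ A datetime column can be written with an explicit conversion type
+ (here 'td', Stata's day-resolution date format); the column name
+ ``'date'`` is illustrative:
+
+ >>> df['date'] = pd.date_range('2000-01-01', periods=4)
+ >>> df.to_stata('animals.dta',
+ ... convert_dates={'date': 'td'}) # doctest: +SKIP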
+ """
+ kwargs = {}
+ if version not in (114, 117):
+ raise ValueError("Only formats 114 and 117 supported.")
+ if version == 114:
+ if convert_strl is not None:
+ raise ValueError(
+ "strl support is only available when using format 117"
+ )
+ from pandas.io.stata import StataWriter as statawriter
+ else:
+ from pandas.io.stata import StataWriter117 as statawriter
+
+ kwargs["convert_strl"] = convert_strl
+
+ writer = statawriter(
+ fname,
+ self,
+ convert_dates=convert_dates,
+ byteorder=byteorder,
+ time_stamp=time_stamp,
+ data_label=data_label,
+ write_index=write_index,
+ variable_labels=variable_labels,
+ **kwargs
+ )
+ writer.write_file()
+
+ def to_feather(self, fname):
+ """
+ Write out the binary feather-format for DataFrames.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ fname : str
+ String file path.
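+
+ Examples
+ --------
+ A minimal sketch; writing requires the ``pyarrow`` library, and the
+ output path ``'df.feather'`` is illustrative:
+
+ >>> df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
+ >>> df.to_feather('df.feather') # doctest: +SKIP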
+ """
+ from pandas.io.feather_format import to_feather
+
+ to_feather(self, fname)
+
+ def to_parquet(
+ self,
+ fname,
+ engine="auto",
+ compression="snappy",
+ index=None,
+ partition_cols=None,
+ **kwargs
+ ):
+ """
+ Write a DataFrame to the binary parquet format.
+
+ .. versionadded:: 0.21.0
+
+ This function writes the dataframe as a `parquet file
+ <https://parquet.apache.org/>`_. You can choose different parquet
+ backends, and have the option of compression. See
+ :ref:`the user guide <io.parquet>` for more details.
+
+ Parameters
+ ----------
+ fname : str
+ File path or Root Directory path. Will be used as Root Directory
+ path while writing a partitioned dataset.
+
+ .. versionchanged:: 0.24.0
+
+ engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+ Parquet library to use. If 'auto', then the option
+ ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+ behavior is to try 'pyarrow', falling back to 'fastparquet' if
+ 'pyarrow' is unavailable.
+ compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
+ Name of the compression to use. Use ``None`` for no compression.
+ index : bool, default None
+ If ``True``, include the dataframe's index(es) in the file output.
+ If ``False``, they will not be written to the file. If ``None``,
+ the behavior depends on the chosen engine.
+
+ .. versionadded:: 0.24.0
+
+ partition_cols : list, optional, default None
+ Column names by which to partition the dataset.
+ Columns are partitioned in the order they are given.
+
+ .. versionadded:: 0.24.0
+
+ **kwargs
+ Additional arguments passed to the parquet library. See
+ :ref:`pandas io <io.parquet>` for more details.
+
+ See Also
+ --------
+ read_parquet : Read a parquet file.
+ DataFrame.to_csv : Write a csv file.
+ DataFrame.to_sql : Write to a sql table.
+ DataFrame.to_hdf : Write to hdf.
+
+ Notes
+ -----
+ This function requires either the `fastparquet
+ <https://pypi.org/project/fastparquet>`_ or `pyarrow
+ <https://arrow.apache.org/docs/python/>`_ library.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
+ >>> df.to_parquet('df.parquet.gzip',
+ ... compression='gzip') # doctest: +SKIP
+ >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
+ col1 col2
+ 0 1 3
+ 1 2 4
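+
+ A partitioned dataset can be written by passing a directory path;
+ ``'parquet_dir'`` here is illustrative:
+
+ >>> df.to_parquet('parquet_dir',
+ ... partition_cols=['col1']) # doctest: +SKIP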
+ """
+ from pandas.io.parquet import to_parquet
+
+ to_parquet(
+ self,
+ fname,
+ engine,
+ compression=compression,
+ index=index,
+ partition_cols=partition_cols,
+ **kwargs
+ )
+
+ @Substitution(
+ header="Whether to print column labels, default True",
+ col_space_type="str or int",
+ col_space="The minimum width of each column in CSS length "
+ "units. An int is assumed to be px units.\n\n"
+ " .. versionadded:: 0.25.0\n"
+ " Ability to use str",
+ )
+ @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+ def to_html(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ justify=None,
+ max_rows=None,
+ max_cols=None,
+ show_dimensions=False,
+ decimal=".",
+ bold_rows=True,
+ classes=None,
+ escape=True,
+ notebook=False,
+ border=None,
+ table_id=None,
+ render_links=False,
+ ):
+ """
+ Render a DataFrame as an HTML table.
+ %(shared_params)s
+ bold_rows : bool, default True
+ Make the row labels bold in the output.
+ classes : str or list or tuple, default None
+ CSS class(es) to apply to the resulting html table.
+ escape : bool, default True
+ Convert the characters <, >, and & to HTML-safe sequences.
+ notebook : bool, default False
+ Whether the generated HTML is for IPython Notebook.
+ border : int
+ A ``border=border`` attribute is included in the opening
+ `<table>` tag. Default ``pd.options.display.html.border``.
+
+ .. versionadded:: 0.19.0
+
+ table_id : str, optional
+ A CSS id is included in the opening `<table>` tag if specified.
+
+ .. versionadded:: 0.23.0
+
+ render_links : bool, default False
+ Convert URLs to HTML links.
+
+ .. versionadded:: 0.24.0
+ %(returns)s
+ See Also
+ --------
+ to_string : Convert DataFrame to a string.
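+
+ Examples
+ --------
+ A minimal sketch; the rendered markup is elided here:
+
+ >>> df = pd.DataFrame({'A': [1, 2]})
+ >>> html = df.to_html() # doctest: +SKIP
+ >>> html.startswith('<table') # doctest: +SKIP
+ True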
+ """
+
+ if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
+ raise ValueError("Invalid value for justify parameter")
+
+ formatter = fmt.DataFrameFormatter(
+ self,
+ buf=buf,
+ columns=columns,
+ col_space=col_space,
+ na_rep=na_rep,
+ formatters=formatters,
+ float_format=float_format,
+ sparsify=sparsify,
+ justify=justify,
+ index_names=index_names,
+ header=header,
+ index=index,
+ bold_rows=bold_rows,
+ escape=escape,
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ decimal=decimal,
+ table_id=table_id,
+ render_links=render_links,
+ )
+ # TODO: a generic formatter would belong in DataFrameFormatter
+ formatter.to_html(classes=classes, notebook=notebook, border=border)
+
+ if buf is None:
+ return formatter.buf.getvalue()
+
+ # ----------------------------------------------------------------------
+
+ def info(
+ self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
+ ):
+ """
+ Print a concise summary of a DataFrame.
+
+ This method prints information about a DataFrame including
+ the index dtype and column dtypes, non-null values and memory usage.
+
+ Parameters
+ ----------
+ verbose : bool, optional
+ Whether to print the full summary. By default, the setting in
+ ``pandas.options.display.max_info_columns`` is followed.
+ buf : writable buffer, defaults to sys.stdout
+ Where to send the output. By default, the output is printed to
+ sys.stdout. Pass a writable buffer if you need to further process
+ the output.
+ max_cols : int, optional
+ When to switch from the verbose to the truncated output. If the
+ DataFrame has more than `max_cols` columns, the truncated output
+ is used. By default, the setting in
+ ``pandas.options.display.max_info_columns`` is used.
+ memory_usage : bool, str, optional
+ Specifies whether total memory usage of the DataFrame
+ elements (including the index) should be displayed. By default,
+ this follows the ``pandas.options.display.memory_usage`` setting.
+
+ True always shows memory usage. False never shows memory usage.
+ A value of 'deep' is equivalent to "True with deep introspection".
+ Memory usage is shown in human-readable units (base-2
+ representation). Without deep introspection a memory estimation is
+ made based on column dtype and number of rows, assuming values
+ consume the same memory amount for corresponding dtypes. With deep
+ memory introspection, a real memory usage calculation is performed
+ at the cost of computational resources.
+ null_counts : bool, optional
+ Whether to show the non-null counts. By default, this is shown
+ only if the frame is smaller than
+ ``pandas.options.display.max_info_rows`` and
+ ``pandas.options.display.max_info_columns``. A value of True always
+ shows the counts, and False never shows the counts.
+
+ Returns
+ -------
+ None
+ This method prints a summary of a DataFrame and returns None.
+
+ See Also
+ --------
+ DataFrame.describe: Generate descriptive statistics of DataFrame
+ columns.
+ DataFrame.memory_usage: Memory usage of DataFrame columns.
+
+ Examples
+ --------
+ >>> int_values = [1, 2, 3, 4, 5]
+ >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
+ >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
+ >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
+ ... "float_col": float_values})
+ >>> df
+ int_col text_col float_col
+ 0 1 alpha 0.00
+ 1 2 beta 0.25
+ 2 3 gamma 0.50
+ 3 4 delta 0.75
+ 4 5 epsilon 1.00
+
+ Prints information about all columns:
+
+ >>> df.info(verbose=True)
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 5 entries, 0 to 4
+ Data columns (total 3 columns):
+ int_col 5 non-null int64
+ text_col 5 non-null object
+ float_col 5 non-null float64
+ dtypes: float64(1), int64(1), object(1)
+ memory usage: 248.0+ bytes
+
+ Prints a summary of the column count and dtypes, without per-column
+ information:
+
+ >>> df.info(verbose=False)
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 5 entries, 0 to 4
+ Columns: 3 entries, int_col to float_col
+ dtypes: float64(1), int64(1), object(1)
+ memory usage: 248.0+ bytes
+
+ Pipe the output of DataFrame.info to a buffer instead of sys.stdout,
+ get the buffer content and write it to a text file:
+
+ >>> import io
+ >>> buffer = io.StringIO()
+ >>> df.info(buf=buffer)
+ >>> s = buffer.getvalue()
+ >>> with open("df_info.txt", "w",
+ ... encoding="utf-8") as f: # doctest: +SKIP
+ ... f.write(s)
+ 260
+
+ The `memory_usage` parameter allows deep introspection mode, especially
+ useful for big DataFrames and for fine-tuning memory optimization:
+
+ >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
+ >>> df = pd.DataFrame({
+ ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+ ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+ ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
+ ... })
+ >>> df.info()
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 1000000 entries, 0 to 999999
+ Data columns (total 3 columns):
+ column_1 1000000 non-null object
+ column_2 1000000 non-null object
+ column_3 1000000 non-null object
+ dtypes: object(3)
+ memory usage: 22.9+ MB
+
+ >>> df.info(memory_usage='deep')
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 1000000 entries, 0 to 999999
+ Data columns (total 3 columns):
+ column_1 1000000 non-null object
+ column_2 1000000 non-null object
+ column_3 1000000 non-null object
+ dtypes: object(3)
+ memory usage: 188.8 MB
+ """
+
+ if buf is None: # pragma: no cover
+ buf = sys.stdout
+
+ lines = []
+
+ lines.append(str(type(self)))
+ lines.append(self.index._summary())
+
+ if len(self.columns) == 0:
+ lines.append("Empty {name}".format(name=type(self).__name__))
+ fmt.buffer_put_lines(buf, lines)
+ return
+
+ cols = self.columns
+
+ # hack
+ if max_cols is None:
+ max_cols = get_option("display.max_info_columns", len(self.columns) + 1)
+
+ max_rows = get_option("display.max_info_rows", len(self) + 1)
+
+ if null_counts is None:
+ show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows)
+ else:
+ show_counts = null_counts
+ exceeds_info_cols = len(self.columns) > max_cols
+
+ def _verbose_repr():
+ lines.append("Data columns (total %d columns):" % len(self.columns))
+ space = max(len(pprint_thing(k)) for k in self.columns) + 4
+ counts = None
+
+ tmpl = "{count}{dtype}"
+ if show_counts:
+ counts = self.count()
+ if len(cols) != len(counts): # pragma: no cover
+ raise AssertionError(
+ "Columns must equal counts "
+ "({cols:d} != {counts:d})".format(
+ cols=len(cols), counts=len(counts)
+ )
+ )
+ tmpl = "{count} non-null {dtype}"
+
+ dtypes = self.dtypes
+ for i, col in enumerate(self.columns):
+ dtype = dtypes.iloc[i]
+ col = pprint_thing(col)
+
+ count = ""
+ if show_counts:
+ count = counts.iloc[i]
+
+ lines.append(
+ _put_str(col, space) + tmpl.format(count=count, dtype=dtype)
+ )
+
+ def _non_verbose_repr():
+ lines.append(self.columns._summary(name="Columns"))
+
+ def _sizeof_fmt(num, size_qualifier):
+ # returns size in human readable format
+ for x in ["bytes", "KB", "MB", "GB", "TB"]:
+ if num < 1024.0:
+ return "{num:3.1f}{size_q} {x}".format(
+ num=num, size_q=size_qualifier, x=x
+ )
+ num /= 1024.0
+ return "{num:3.1f}{size_q} {pb}".format(
+ num=num, size_q=size_qualifier, pb="PB"
+ )
+
+ if verbose:
+ _verbose_repr()
+ elif verbose is False: # specifically set to False, not necessarily None
+ _non_verbose_repr()
+ else:
+ if exceeds_info_cols:
+ _non_verbose_repr()
+ else:
+ _verbose_repr()
+
+ counts = self._data.get_dtype_counts()
+ dtypes = ["{k}({kk:d})".format(k=k[0], kk=k[1]) for k in sorted(counts.items())]
+ lines.append("dtypes: {types}".format(types=", ".join(dtypes)))
+
+ if memory_usage is None:
+ memory_usage = get_option("display.memory_usage")
+ if memory_usage:
+ # append memory usage of df to display
+ size_qualifier = ""
+ if memory_usage == "deep":
+ deep = True
+ else:
+ # size_qualifier is just a best effort; not guaranteed to catch
+ # all cases (e.g., it misses categorical data even with object
+ # categories)
+ deep = False
+ if "object" in counts or self.index._is_memory_usage_qualified():
+ size_qualifier = "+"
+ mem_usage = self.memory_usage(index=True, deep=deep).sum()
+ lines.append(
+ "memory usage: {mem}\n".format(
+ mem=_sizeof_fmt(mem_usage, size_qualifier)
+ )
+ )
+
+ fmt.buffer_put_lines(buf, lines)
+
+ def memory_usage(self, index=True, deep=False):
+ """
+ Return the memory usage of each column in bytes.
+
+ The memory usage can optionally include the contribution of
+ the index and elements of `object` dtype.
+
+ This value is displayed in `DataFrame.info` by default. This can be
+ suppressed by setting ``pandas.options.display.memory_usage`` to False.
+
+ Parameters
+ ----------
+ index : bool, default True
+ Specifies whether to include the memory usage of the DataFrame's
+ index in returned Series. If ``index=True``, the memory usage of
+ the index is the first item in the output.
+ deep : bool, default False
+ If True, introspect the data deeply by interrogating
+ `object` dtypes for system-level memory consumption, and include
+ it in the returned values.
+
+ Returns
+ -------
+ Series
+ A Series whose index is the original column names and whose values
+ are the memory usage of each column in bytes.
+
+ See Also
+ --------
+ numpy.ndarray.nbytes : Total bytes consumed by the elements of an
+ ndarray.
+ Series.memory_usage : Bytes consumed by a Series.
+ Categorical : Memory-efficient array for string values with
+ many repeated values.
+ DataFrame.info : Concise summary of a DataFrame.
+
+ Examples
+ --------
+ >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
+ >>> data = dict([(t, np.ones(shape=5000).astype(t))
+ ... for t in dtypes])
+ >>> df = pd.DataFrame(data)
+ >>> df.head()
+ int64 float64 complex128 object bool
+ 0 1 1.0 1.0+0.0j 1 True
+ 1 1 1.0 1.0+0.0j 1 True
+ 2 1 1.0 1.0+0.0j 1 True
+ 3 1 1.0 1.0+0.0j 1 True
+ 4 1 1.0 1.0+0.0j 1 True
+
+ >>> df.memory_usage()
+ Index 128
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64
+
+ >>> df.memory_usage(index=False)
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64
+
+ The memory footprint of `object` dtype columns is ignored by default:
+
+ >>> df.memory_usage(deep=True)
+ Index 128
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 160000
+ bool 5000
+ dtype: int64
+
+ Use a Categorical for efficient storage of an object-dtype column with
+ many repeated values.
+
+ >>> df['object'].astype('category').memory_usage(deep=True)
+ 5216
+ """
+ result = Series(
+ [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
+ index=self.columns,
+ )
+ if index:
+ result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append(
+ result
+ )
+ return result
+
+ def transpose(self, *args, **kwargs):
+ """
+ Transpose index and columns.
+
+ Reflect the DataFrame over its main diagonal by writing rows as columns
+ and vice-versa. The property :attr:`.T` is an accessor to the method
+ :meth:`transpose`.
+
+ Parameters
+ ----------
+ copy : bool, default False
+ If True, the underlying data is copied. Otherwise (default), no
+ copy is made if possible.
+ *args, **kwargs
+ Additional keywords have no effect but might be accepted for
+ compatibility with numpy.
+
+ Returns
+ -------
+ DataFrame
+ The transposed DataFrame.
+
+ See Also
+ --------
+ numpy.transpose : Permute the dimensions of a given array.
+
+ Notes
+ -----
+ Transposing a DataFrame with mixed dtypes will result in a homogeneous
+ DataFrame with the `object` dtype. In such a case, a copy of the data
+ is always made.
+
+ Examples
+ --------
+ **Square DataFrame with homogeneous dtype**
+
+ >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
+ >>> df1 = pd.DataFrame(data=d1)
+ >>> df1
+ col1 col2
+ 0 1 3
+ 1 2 4
+
+ >>> df1_transposed = df1.T # or df1.transpose()
+ >>> df1_transposed
+ 0 1
+ col1 1 2
+ col2 3 4
+
+ When the dtype is homogeneous in the original DataFrame, we get a
+ transposed DataFrame with the same dtype:
+
+ >>> df1.dtypes
+ col1 int64
+ col2 int64
+ dtype: object
+ >>> df1_transposed.dtypes
+ 0 int64
+ 1 int64
+ dtype: object
+
+ **Non-square DataFrame with mixed dtypes**
+
+ >>> d2 = {'name': ['Alice', 'Bob'],
+ ... 'score': [9.5, 8],
+ ... 'employed': [False, True],
+ ... 'kids': [0, 0]}
+ >>> df2 = pd.DataFrame(data=d2)
+ >>> df2
+ name score employed kids
+ 0 Alice 9.5 False 0
+ 1 Bob 8.0 True 0
+
+ >>> df2_transposed = df2.T # or df2.transpose()
+ >>> df2_transposed
+ 0 1
+ name Alice Bob
+ score 9.5 8
+ employed False True
+ kids 0 0
+
+ When the DataFrame has mixed dtypes, we get a transposed DataFrame with
+ the `object` dtype:
+
+ >>> df2.dtypes
+ name object
+ score float64
+ employed bool
+ kids int64
+ dtype: object
+ >>> df2_transposed.dtypes
+ 0 object
+ 1 object
+ dtype: object
+ """
+ nv.validate_transpose(args, dict())
+ return super().transpose(1, 0, **kwargs)
+
+ T = property(transpose)
+
+ # ----------------------------------------------------------------------
+ # Picklability
+
+ # legacy pickle formats
+ def _unpickle_frame_compat(self, state): # pragma: no cover
+ if len(state) == 2: # pragma: no cover
+ series, idx = state
+ columns = sorted(series)
+ else:
+ series, cols, idx = state
+ columns = com._unpickle_array(cols)
+
+ index = com._unpickle_array(idx)
+ self._data = self._init_dict(series, index, columns, None)
+
+ def _unpickle_matrix_compat(self, state): # pragma: no cover
+ # old unpickling
+ (vals, idx, cols), object_state = state
+
+ index = com._unpickle_array(idx)
+ dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols), copy=False)
+
+ if object_state is not None:
+ ovals, _, ocols = object_state
+ objects = DataFrame(
+ ovals, index=index, columns=com._unpickle_array(ocols), copy=False
+ )
+
+ dm = dm.join(objects)
+
+ self._data = dm._data
+
+ # ----------------------------------------------------------------------
+ # Getting and setting elements
+
+ def get_value(self, index, col, takeable=False):
+ """
+ Quickly retrieve single value at passed column and index.
+
+ .. deprecated:: 0.21.0
+ Use .at[] or .iat[] accessors instead.
+
+ Parameters
+ ----------
+ index : row label
+ col : column label
+ takeable : bool, default False
+ Interpret the index/col as indexers.
+
+ Returns
+ -------
+ scalar
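+
+ Examples
+ --------
+ The recommended ``.at`` replacement, shown on a minimal frame:
+
+ >>> df = pd.DataFrame({'A': [1, 2]})
+ >>> df.at[0, 'A']
+ 1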
+ """
+
+ warnings.warn(
+ "get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return self._get_value(index, col, takeable=takeable)
+
+ def _get_value(self, index, col, takeable=False):
+
+ if takeable:
+ series = self._iget_item_cache(col)
+ return com.maybe_box_datetimelike(series._values[index])
+
+ series = self._get_item_cache(col)
+ engine = self.index._engine
+
+ try:
+ return engine.get_value(series._values, index)
+ except KeyError:
+ # GH 20629
+ if self.index.nlevels > 1:
+ # partial indexing forbidden
+ raise
+ except (TypeError, ValueError):
+ pass
+
+ # we cannot handle direct indexing
+ # use positional
+ col = self.columns.get_loc(col)
+ index = self.index.get_loc(index)
+ return self._get_value(index, col, takeable=True)
+
+ _get_value.__doc__ = get_value.__doc__
+
+ def set_value(self, index, col, value, takeable=False):
+ """
+ Put single value at passed column and index.
+
+ .. deprecated:: 0.21.0
+ Use .at[] or .iat[] accessors instead.
+
+ Parameters
+ ----------
+ index : row label
+ col : column label
+ value : scalar
+ takeable : bool, default False
+ Interpret the index/col as indexers.
+
+ Returns
+ -------
+ DataFrame
+ If the label pair is contained, the result will be a reference to the
+ calling DataFrame; otherwise a new object.
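+
+ Examples
+ --------
+ The recommended ``.at`` replacement, shown on a minimal frame:
+
+ >>> df = pd.DataFrame({'A': [1, 2]})
+ >>> df.at[0, 'A'] = 10
+ >>> df.at[0, 'A']
+ 10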
+ """
+ warnings.warn(
+ "set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return self._set_value(index, col, value, takeable=takeable)
+
+ def _set_value(self, index, col, value, takeable=False):
+ try:
+ if takeable is True:
+ series = self._iget_item_cache(col)
+ return series._set_value(index, value, takeable=True)
+
+ series = self._get_item_cache(col)
+ engine = self.index._engine
+ engine.set_value(series._values, index, value)
+ return self
+ except (KeyError, TypeError):
+
+ # set using a non-recursive method & reset the cache
+ if takeable:
+ self.iloc[index, col] = value
+ else:
+ self.loc[index, col] = value
+ self._item_cache.pop(col, None)
+
+ return self
+
+ _set_value.__doc__ = set_value.__doc__
+
+ def _ixs(self, i: int, axis: int = 0):
+ """
+ Parameters
+ ----------
+ i : int
+ axis : int
+
+ Notes
+ -----
+ If a slice is passed, the resulting data will be a view.
+ """
+ # irow
+ if axis == 0:
+ label = self.index[i]
+ new_values = self._data.fast_xs(i)
+ if is_scalar(new_values):
+ return new_values
+
+ # if we are a copy, mark as such
+ copy = isinstance(new_values, np.ndarray) and new_values.base is None
+ result = self._constructor_sliced(
+ new_values,
+ index=self.columns,
+ name=self.index[i],
+ dtype=new_values.dtype,
+ )
+ result._set_is_copy(self, copy=copy)
+ return result
+
+ # icol
+ else:
+ label = self.columns[i]
+
+ # if the values returned are not the same length
+ # as the index (i.e. a not-found value), iget returns
+ # a 0-len ndarray. This is effectively catching
+ # a numpy error (as numpy should really raise)
+ values = self._data.iget(i)
+
+ if len(self.index) and not len(values):
+ values = np.array([np.nan] * len(self.index), dtype=object)
+ result = self._box_col_values(values, label)
+
+ # this is a cached value, mark it so
+ result._set_as_cached(label, self)
+
+ return result
+
+ def __getitem__(self, key):
+ key = lib.item_from_zerodim(key)
+ key = com.apply_if_callable(key, self)
+
+ if is_hashable(key):
+ # shortcut if the key is in columns
+ if self.columns.is_unique and key in self.columns:
+ if self.columns.nlevels > 1:
+ return self._getitem_multilevel(key)
+ return self._get_item_cache(key)
+
+ # Do we have a slicer (on rows)?
+ indexer = convert_to_index_sliceable(self, key)
+ if indexer is not None:
+ return self._slice(indexer, axis=0)
+
+ # Do we have a (boolean) DataFrame?
+ if isinstance(key, DataFrame):
+ return self._getitem_frame(key)
+
+ # Do we have a (boolean) 1d indexer?
+ if com.is_bool_indexer(key):
+ return self._getitem_bool_array(key)
+
+ # We are left with two options: a single key and a collection of keys.
+ # We interpret tuples as collections only for non-MultiIndex.
+ is_single_key = isinstance(key, tuple) or not is_list_like(key)
+
+ if is_single_key:
+ if self.columns.nlevels > 1:
+ return self._getitem_multilevel(key)
+ indexer = self.columns.get_loc(key)
+ if is_integer(indexer):
+ indexer = [indexer]
+ else:
+ if is_iterator(key):
+ key = list(key)
+ indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True)
+
+ # take() does not accept boolean indexers
+ if getattr(indexer, "dtype", None) == bool:
+ indexer = np.where(indexer)[0]
+
+ data = self.take(indexer, axis=1)
+
+ if is_single_key:
+ # What does looking for a single key in a non-unique index return?
+ # The behavior is inconsistent. It returns a Series, except when
+ # - the key itself is repeated (test on data.shape, #9519), or
+ # - we have a MultiIndex on columns (test on self.columns, #21309)
+ if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
+ data = data[key]
+
+ return data
+
+ def _getitem_bool_array(self, key):
+ # also raises Exception if object array with NA values
+ # warning here just in case -- previously __setitem__ was
+ # reindexing but __getitem__ was not; it seems more reasonable to
+ # go with the __setitem__ behavior since that is more consistent
+ # with all other indexing behavior
+ if isinstance(key, Series) and not key.index.equals(self.index):
+ warnings.warn(
+ "Boolean Series key will be reindexed to match DataFrame index.",
+ UserWarning,
+ stacklevel=3,
+ )
+ elif len(key) != len(self.index):
+ raise ValueError(
+ "Item wrong length %d instead of %d." % (len(key), len(self.index))
+ )
+
+ # check_bool_indexer will throw exception if Series key cannot
+ # be reindexed to match DataFrame rows
+ key = check_bool_indexer(self.index, key)
+ indexer = key.nonzero()[0]
+ return self.take(indexer, axis=0)
+
+ def _getitem_multilevel(self, key):
+ loc = self.columns.get_loc(key)
+ if isinstance(loc, (slice, Series, np.ndarray, Index)):
+ new_columns = self.columns[loc]
+ result_columns = maybe_droplevels(new_columns, key)
+ if self._is_mixed_type:
+ result = self.reindex(columns=new_columns)
+ result.columns = result_columns
+ else:
+ new_values = self.values[:, loc]
+ result = self._constructor(
+ new_values, index=self.index, columns=result_columns
+ )
+ result = result.__finalize__(self)
+
+ # If there is only one column being returned, and its name is
+ # either an empty string, or a tuple with an empty string as its
+ # first element, then treat the empty string as a placeholder
+ # and return the column as if the user had provided that empty
+ # string in the key. If the result is a Series, exclude the
+ # implied empty string from its name.
+ if len(result.columns) == 1:
+ top = result.columns[0]
+ if isinstance(top, tuple):
+ top = top[0]
+ if top == "":
+ result = result[""]
+ if isinstance(result, Series):
+ result = self._constructor_sliced(
+ result, index=self.index, name=key
+ )
+
+ result._set_is_copy(self)
+ return result
+ else:
+ return self._get_item_cache(key)
+
+ def _getitem_frame(self, key):
+ if key.values.size and not is_bool_dtype(key.values):
+ raise ValueError("Must pass DataFrame with boolean values only")
+ return self.where(key)
+
+ def query(self, expr, inplace=False, **kwargs):
+ """
+ Query the columns of a DataFrame with a boolean expression.
+
+ Parameters
+ ----------
+ expr : str
+ The query string to evaluate. You can refer to variables
+ in the environment by prefixing them with an '@' character like
+ ``@a + b``.
+
+ .. versionadded:: 0.25.0
+
+ You can refer to column names that contain spaces by surrounding
+ them in backticks.
+
+ For example, if one of your columns is called ``a a`` and you want
+ to sum it with ``b``, your query should be ```a a` + b``.
+
+ inplace : bool, default False
+ Whether the query should modify the data in place or return
+ a modified copy.
+ **kwargs
+ See the documentation for :func:`eval` for complete details
+ on the keyword arguments accepted by :meth:`DataFrame.query`.
+
+ .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ DataFrame
+ DataFrame resulting from the provided query expression.
+
+ See Also
+ --------
+ eval : Evaluate a string describing operations on
+ DataFrame columns.
+ DataFrame.eval : Evaluate a string describing operations on
+ DataFrame columns.
+
+ Notes
+ -----
+ The result of the evaluation of this expression is first passed to
+ :attr:`DataFrame.loc` and if that fails because of a
+ multidimensional key (e.g., a DataFrame) then the result will be passed
+ to :meth:`DataFrame.__getitem__`.
+
+ This method uses the top-level :func:`eval` function to
+ evaluate the passed query.
+
+ The :meth:`~pandas.DataFrame.query` method uses a slightly
+ modified Python syntax by default. For example, the ``&`` and ``|``
+ (bitwise) operators have the precedence of their boolean cousins,
+ :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
+ however the semantics are different.
+
+ You can change the semantics of the expression by passing the keyword
+ argument ``parser='python'``. This enforces the same semantics as
+ evaluation in Python space. Likewise, you can pass ``engine='python'``
+ to evaluate an expression using Python itself as a backend. This is not
+ recommended as it is inefficient compared to using ``numexpr`` as the
+ engine.
+
+ The :attr:`DataFrame.index` and
+ :attr:`DataFrame.columns` attributes of the
+ :class:`~pandas.DataFrame` instance are placed in the query namespace
+ by default, which allows you to treat both the index and columns of the
+ frame as a column in the frame.
+ The identifier ``index`` is used for the frame index; you can also
+ use the name of the index to identify it in a query. Please note that
+ Python keywords may not be used as identifiers.
+
+ For further details and examples see the ``query`` documentation in
+ :ref:`indexing <indexing.query>`.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': range(1, 6),
+ ... 'B': range(10, 0, -2),
+ ... 'C C': range(10, 5, -1)})
+ >>> df
+ A B C C
+ 0 1 10 10
+ 1 2 8 9
+ 2 3 6 8
+ 3 4 4 7
+ 4 5 2 6
+ >>> df.query('A > B')
+ A B C C
+ 4 5 2 6
+
+ The previous expression is equivalent to
+
+ >>> df[df.A > df.B]
+ A B C C
+ 4 5 2 6
+
+ For columns with spaces in their name, you can use backtick quoting.
+
+ >>> df.query('B == `C C`')
+ A B C C
+ 0 1 10 10
+
+ The previous expression is equivalent to
+
+ >>> df[df.B == df['C C']]
+ A B C C
+ 0 1 10 10
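+
+ Variables in the calling namespace can be referenced with the ``@``
+ prefix; ``limit`` here is an ordinary Python variable:
+
+ >>> limit = 3
+ >>> df.query('A > @limit')
+ A B C C
+ 3 4 4 7
+ 4 5 2 6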
+ """
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ if not isinstance(expr, str):
+ msg = "expr must be a string to be evaluated, {0} given"
+ raise ValueError(msg.format(type(expr)))
+ kwargs["level"] = kwargs.pop("level", 0) + 1
+ kwargs["target"] = None
+ res = self.eval(expr, **kwargs)
+
+ try:
+ new_data = self.loc[res]
+ except ValueError:
+ # when res is multi-dimensional loc raises, but this is sometimes a
+ # valid query
+ new_data = self[res]
+
+ if inplace:
+ self._update_inplace(new_data)
+ else:
+ return new_data
+
+ def eval(self, expr, inplace=False, **kwargs):
+ """
+ Evaluate a string describing operations on DataFrame columns.
+
+ Operates on columns only, not specific rows or elements. This allows
+ `eval` to run arbitrary code, which can make you vulnerable to code
+ injection if you pass user input to this function.
+
+ Parameters
+ ----------
+ expr : str
+ The expression string to evaluate.
+ inplace : bool, default False
+ If the expression contains an assignment, whether to perform the
+ operation inplace and mutate the existing DataFrame. Otherwise,
+ a new DataFrame is returned.
+
+ .. versionadded:: 0.18.0
+
+ kwargs : dict
+ See the documentation for :func:`eval` for complete details
+ on the keyword arguments accepted by
+ :meth:`~pandas.DataFrame.query`.
+
+ Returns
+ -------
+ ndarray, scalar, or pandas object
+ The result of the evaluation.
+
+ See Also
+ --------
+ DataFrame.query : Evaluates a boolean expression to query the columns
+ of a frame.
+ DataFrame.assign : Can evaluate an expression or function to create new
+ values for a column.
+ eval : Evaluate a Python expression as a string using various
+ backends.
+
+ Notes
+ -----
+ For more details see the API documentation for :func:`~eval`.
+ For detailed examples see :ref:`enhancing performance with eval
+ <enhancingperf.eval>`.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
+ >>> df
+ A B
+ 0 1 10
+ 1 2 8
+ 2 3 6
+ 3 4 4
+ 4 5 2
+ >>> df.eval('A + B')
+ 0 11
+ 1 10
+ 2 9
+ 3 8
+ 4 7
+ dtype: int64
+
+ Assignment is allowed, though by default the original DataFrame is not
+ modified.
+
+ >>> df.eval('C = A + B')
+ A B C
+ 0 1 10 11
+ 1 2 8 10
+ 2 3 6 9
+ 3 4 4 8
+ 4 5 2 7
+ >>> df
+ A B
+ 0 1 10
+ 1 2 8
+ 2 3 6
+ 3 4 4
+ 4 5 2
+
+ Use ``inplace=True`` to modify the original DataFrame.
+
+ >>> df.eval('C = A + B', inplace=True)
+ >>> df
+ A B C
+ 0 1 10 11
+ 1 2 8 10
+ 2 3 6 9
+ 3 4 4 8
+ 4 5 2 7
+ """
+ from pandas.core.computation.eval import eval as _eval
+
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ resolvers = kwargs.pop("resolvers", None)
+ kwargs["level"] = kwargs.pop("level", 0) + 1
+ if resolvers is None:
+ index_resolvers = self._get_index_resolvers()
+ column_resolvers = self._get_space_character_free_column_resolvers()
+ resolvers = column_resolvers, index_resolvers
+ if "target" not in kwargs:
+ kwargs["target"] = self
+ kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
+ return _eval(expr, inplace=inplace, **kwargs)
+
+ def select_dtypes(self, include=None, exclude=None):
+ """
+ Return a subset of the DataFrame's columns based on the column dtypes.
+
+ Parameters
+ ----------
+ include, exclude : scalar or list-like
+ A selection of dtypes or strings to be included/excluded. At least
+ one of these parameters must be supplied.
+
+ Returns
+ -------
+ DataFrame
+ The subset of the frame including the dtypes in ``include`` and
+ excluding the dtypes in ``exclude``.
+
+ Raises
+ ------
+ ValueError
+ * If both of ``include`` and ``exclude`` are empty
+ * If ``include`` and ``exclude`` have overlapping elements
+ * If any kind of string dtype is passed in.
+
+ Notes
+ -----
+ * To select all *numeric* types, use ``np.number`` or ``'number'``
+ * To select strings you must use the ``object`` dtype, but note that
+ this will return *all* object dtype columns
+ * See the `numpy dtype hierarchy
+ <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
+ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
+ ``'datetime64'``
+ * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
+ ``'timedelta64'``
+ * To select Pandas categorical dtypes, use ``'category'``
+ * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
+ 0.20.0) or ``'datetime64[ns, tz]'``
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'a': [1, 2] * 3,
+ ... 'b': [True, False] * 3,
+ ... 'c': [1.0, 2.0] * 3})
+ >>> df
+ a b c
+ 0 1 True 1.0
+ 1 2 False 2.0
+ 2 1 True 1.0
+ 3 2 False 2.0
+ 4 1 True 1.0
+ 5 2 False 2.0
+
+ >>> df.select_dtypes(include='bool')
+ b
+ 0 True
+ 1 False
+ 2 True
+ 3 False
+ 4 True
+ 5 False
+
+ >>> df.select_dtypes(include=['float64'])
+ c
+ 0 1.0
+ 1 2.0
+ 2 1.0
+ 3 2.0
+ 4 1.0
+ 5 2.0
+
+ >>> df.select_dtypes(exclude=['int'])
+ b c
+ 0 True 1.0
+ 1 False 2.0
+ 2 True 1.0
+ 3 False 2.0
+ 4 True 1.0
+ 5 False 2.0
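+
+ Datetime columns can be selected with ``'datetime'``; the added
+ column ``'d'`` is illustrative:
+
+ >>> df['d'] = pd.date_range('2000-01-01', periods=6)
+ >>> df.select_dtypes(include='datetime')
+ d
+ 0 2000-01-01
+ 1 2000-01-02
+ 2 2000-01-03
+ 3 2000-01-04
+ 4 2000-01-05
+ 5 2000-01-06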
+ """
+
+ def _get_info_slice(obj, indexer):
+ """Slice the info axis of `obj` with `indexer`."""
+ if not hasattr(obj, "_info_axis_number"):
+ msg = "object of type {typ!r} has no info axis"
+ raise TypeError(msg.format(typ=type(obj).__name__))
+ slices = [slice(None)] * obj.ndim
+ slices[obj._info_axis_number] = indexer
+ return tuple(slices)
+
+ if not is_list_like(include):
+ include = (include,) if include is not None else ()
+ if not is_list_like(exclude):
+ exclude = (exclude,) if exclude is not None else ()
+
+ selection = tuple(map(frozenset, (include, exclude)))
+
+ if not any(selection):
+ raise ValueError("at least one of include or exclude must be nonempty")
+
+ # convert the myriad valid dtypes object to a single representation
+ include, exclude = map(
+ lambda x: frozenset(map(infer_dtype_from_object, x)), selection
+ )
+ for dtypes in (include, exclude):
+ invalidate_string_dtypes(dtypes)
+
+ # can't both include AND exclude!
+ if not include.isdisjoint(exclude):
+ raise ValueError(
+ "include and exclude overlap on {inc_ex}".format(
+ inc_ex=(include & exclude)
+ )
+ )
+
+ # empty include/exclude -> defaults to True
+ # three cases (we've already raised if both are empty)
+ # case 1: empty include, nonempty exclude
+ # we have True, True, ... True for include, same for exclude
+ # in the loop below we get the excluded
+ # and when we call '&' below we get only the excluded
+ # case 2: nonempty include, empty exclude
+ # same as case 1, but with include
+ # case 3: both nonempty
+ # the "union" of the logic of case 1 and case 2:
+ # we get the included and excluded, and return their logical and
+ include_these = Series(not bool(include), index=self.columns)
+ exclude_these = Series(not bool(exclude), index=self.columns)
+
+ def is_dtype_instance_mapper(idx, dtype):
+ return idx, functools.partial(issubclass, dtype.type)
+
+ for idx, f in itertools.starmap(
+ is_dtype_instance_mapper, enumerate(self.dtypes)
+ ):
+ if include: # checks for the case of empty include or exclude
+ include_these.iloc[idx] = any(map(f, include))
+ if exclude:
+ exclude_these.iloc[idx] = not any(map(f, exclude))
+
+ dtype_indexer = include_these & exclude_these
+ return self.loc[_get_info_slice(self, dtype_indexer)]
+
+ def _box_item_values(self, key, values):
+ items = self.columns[self.columns.get_loc(key)]
+ if values.ndim == 2:
+ return self._constructor(values.T, columns=items, index=self.index)
+ else:
+ return self._box_col_values(values, items)
+
+ def _box_col_values(self, values, items):
+ """
+ Provide boxed values for a column.
+ """
+ klass = self._constructor_sliced
+ return klass(values, index=self.index, name=items, fastpath=True)
+
+ def __setitem__(self, key, value):
+ key = com.apply_if_callable(key, self)
+
+ # see if we can slice the rows
+ indexer = convert_to_index_sliceable(self, key)
+ if indexer is not None:
+ return self._setitem_slice(indexer, value)
+
+ if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
+ self._setitem_frame(key, value)
+ elif isinstance(key, (Series, np.ndarray, list, Index)):
+ self._setitem_array(key, value)
+ else:
+ # set column
+ self._set_item(key, value)
+
+ def _setitem_slice(self, key, value):
+ self._check_setitem_copy()
+ self.loc._setitem_with_indexer(key, value)
+
+ def _setitem_array(self, key, value):
+ # also raises Exception if object array with NA values
+ if com.is_bool_indexer(key):
+ if len(key) != len(self.index):
+ raise ValueError(
+ "Item wrong length %d instead of %d!" % (len(key), len(self.index))
+ )
+ key = check_bool_indexer(self.index, key)
+ indexer = key.nonzero()[0]
+ self._check_setitem_copy()
+ self.loc._setitem_with_indexer(indexer, value)
+ else:
+ if isinstance(value, DataFrame):
+ if len(value.columns) != len(key):
+ raise ValueError("Columns must be same length as key")
+ for k1, k2 in zip(key, value.columns):
+ self[k1] = value[k2]
+ else:
+ indexer = self.loc._convert_to_indexer(key, axis=1)
+ self._check_setitem_copy()
+ self.loc._setitem_with_indexer((slice(None), indexer), value)
+
+ def _setitem_frame(self, key, value):
+ # support boolean setting with DataFrame input, e.g.
+ # df[df > df2] = 0
+ if isinstance(key, np.ndarray):
+ if key.shape != self.shape:
+ raise ValueError("Array conditional must be same shape as self")
+ key = self._constructor(key, **self._construct_axes_dict())
+
+ if key.values.size and not is_bool_dtype(key.values):
+ raise TypeError(
+ "Must pass DataFrame or 2-d ndarray with boolean values only"
+ )
+
+ self._check_inplace_setting(value)
+ self._check_setitem_copy()
+ self._where(-key, value, inplace=True)
+
+ def _ensure_valid_index(self, value):
+ """
+ Ensure that if we don't have an index, that we can create one from the
+ passed value.
+ """
+ # GH5632, make sure that we are a Series convertible
+ if not len(self.index) and is_list_like(value):
+ try:
+ value = Series(value)
+ except (ValueError, NotImplementedError, TypeError):
+ raise ValueError(
+ "Cannot set a frame with no defined index "
+ "and a value that cannot be converted to a Series"
+ )
+
+ self._data = self._data.reindex_axis(
+ value.index.copy(), axis=1, fill_value=np.nan
+ )
+
+ def _set_item(self, key, value):
+ """
+ Add series to DataFrame in specified column.
+
+ If series is a numpy-array (not a Series/TimeSeries), it must be the
+ same length as the DataFrame's index or an error will be thrown.
+
+ Series/TimeSeries will be conformed to the DataFrame's index to
+ ensure homogeneity.
+ """
+
+ self._ensure_valid_index(value)
+ value = self._sanitize_column(key, value)
+ NDFrame._set_item(self, key, value)
+
+ # check if we are modifying a copy
+ # try to set first as we want an invalid
+ # value exception to occur first
+ if len(self):
+ self._check_setitem_copy()
+
+ def insert(self, loc, column, value, allow_duplicates=False):
+ """
+ Insert column into DataFrame at specified location.
+
+ Raises a ValueError if `column` is already contained in the DataFrame,
+ unless `allow_duplicates` is set to True.
+
+ Parameters
+ ----------
+ loc : int
+ Insertion index. Must verify 0 <= loc <= len(columns).
+ column : string, number, or hashable object
+ Label of the inserted column.
+ value : int, Series, or array-like
+ allow_duplicates : bool, optional
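+
+ Examples
+ --------
+ A minimal sketch:
+
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+ >>> df.insert(1, 'C', [5, 6])
+ >>> df
+ A C B
+ 0 1 5 3
+ 1 2 6 4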
+ """
+ self._ensure_valid_index(value)
+ value = self._sanitize_column(column, value, broadcast=False)
+ self._data.insert(loc, column, value, allow_duplicates=allow_duplicates)
+
+ def assign(self, **kwargs):
+ r"""
+ Assign new columns to a DataFrame.
+
+ Returns a new object with all original columns in addition to new ones.
+ Existing columns that are re-assigned will be overwritten.
+
+ Parameters
+ ----------
+ **kwargs : dict of {str: callable or Series}
+ The column names are keywords. If the values are
+ callable, they are computed on the DataFrame and
+ assigned to the new columns. The callable must not
+ change the input DataFrame (though pandas doesn't check it).
+ If the values are not callable, (e.g. a Series, scalar, or array),
+ they are simply assigned.
+
+ Returns
+ -------
+ DataFrame
+ A new DataFrame with the new columns in addition to
+ all the existing columns.
+
+ Notes
+ -----
+ Assigning multiple columns within the same ``assign`` is possible.
+ For Python 3.6 and above, later items in '\*\*kwargs' may refer to
+ newly created or modified columns in 'df'; items are computed and
+ assigned into 'df' in order. For Python 3.5 and below, the order of
+ keyword arguments is not specified, so you cannot refer to newly
+ created or modified columns. All items are computed first, and then
+ assigned in alphabetical order.
+
+ .. versionchanged:: 0.23.0
+
+ Keyword argument order is maintained for Python 3.6 and later.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
+ ... index=['Portland', 'Berkeley'])
+ >>> df
+ temp_c
+ Portland 17.0
+ Berkeley 25.0
+
+ Where the value is a callable, evaluated on `df`:
+
+ >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
+ temp_c temp_f
+ Portland 17.0 62.6
+ Berkeley 25.0 77.0
+
+ Alternatively, the same behavior can be achieved by directly
+ referencing an existing Series or sequence:
+
+ >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
+ temp_c temp_f
+ Portland 17.0 62.6
+ Berkeley 25.0 77.0
+
+ In Python 3.6+, you can create multiple columns within the same assign
+ where one of the columns depends on another one defined within the same
+ assign:
+
+ >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
+ ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
+ temp_c temp_f temp_k
+ Portland 17.0 62.6 290.15
+ Berkeley 25.0 77.0 298.15
+ """
+ data = self.copy()
+
+ # >= 3.6 preserve order of kwargs
+ if PY36:
+ for k, v in kwargs.items():
+ data[k] = com.apply_if_callable(v, data)
+ else:
+ # <= 3.5: do all calculations first...
+ results = OrderedDict()
+ for k, v in kwargs.items():
+ results[k] = com.apply_if_callable(v, data)
+
+ # <= 3.5 and earlier
+ results = sorted(results.items())
+ # ... and then assign
+ for k, v in results:
+ data[k] = v
+ return data
+
+ def _sanitize_column(self, key, value, broadcast=True):
+ """
+ Ensures new columns (which go into the BlockManager as new blocks) are
+ always copied and converted into an array.
+
+ Parameters
+ ----------
+ key : object
+ value : scalar, Series, or array-like
+ broadcast : bool, default True
+ If ``key`` matches multiple duplicate column names in the
+ DataFrame, this parameter indicates whether ``value`` should be
+ tiled so that the returned array contains a (duplicated) column for
+ each occurrence of the key. If False, ``value`` will not be tiled.
+
+ Returns
+ -------
+ numpy.ndarray
+ """
+
+ def reindexer(value):
+ # reindex if necessary
+
+ if value.index.equals(self.index) or not len(self.index):
+ value = value._values.copy()
+ else:
+
+ # GH 4107
+ try:
+ value = value.reindex(self.index)._values
+ except Exception as e:
+
+ # duplicate axis
+ if not value.index.is_unique:
+ raise e
+
+ # other
+ raise TypeError(
+ "incompatible index of inserted column with frame index"
+ )
+ return value
+
+ if isinstance(value, Series):
+ value = reindexer(value)
+
+ elif isinstance(value, DataFrame):
+ # align right-hand-side columns if self.columns
+ # is multi-index and self[key] is a sub-frame
+ if isinstance(self.columns, MultiIndex) and key in self.columns:
+ loc = self.columns.get_loc(key)
+ if isinstance(loc, (slice, Series, np.ndarray, Index)):
+ cols = maybe_droplevels(self.columns[loc], key)
+ if len(cols) and not cols.equals(value.columns):
+ value = value.reindex(cols, axis=1)
+ # now align rows
+ value = reindexer(value).T
+
+ elif isinstance(value, ExtensionArray):
+ # Explicitly copy here, instead of in sanitize_index,
+ # as sanitize_index won't copy an EA, even with copy=True
+ value = value.copy()
+ value = sanitize_index(value, self.index, copy=False)
+
+ elif isinstance(value, Index) or is_sequence(value):
+
+ # turn me into an ndarray
+ value = sanitize_index(value, self.index, copy=False)
+ if not isinstance(value, (np.ndarray, Index)):
+ if isinstance(value, list) and len(value) > 0:
+ value = maybe_convert_platform(value)
+ else:
+ value = com.asarray_tuplesafe(value)
+ elif value.ndim == 2:
+ value = value.copy().T
+ elif isinstance(value, Index):
+ value = value.copy(deep=True)
+ else:
+ value = value.copy()
+
+ # possibly infer to datetimelike
+ if is_object_dtype(value.dtype):
+ value = maybe_infer_to_datetimelike(value)
+
+ else:
+ # cast ignores pandas dtypes, so save the dtype first
+ infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)
+
+ # upcast
+ value = cast_scalar_to_array(len(self.index), value)
+ value = maybe_cast_to_datetime(value, infer_dtype)
+
+ # return internal types directly
+ if is_extension_type(value) or is_extension_array_dtype(value):
+ return value
+
+ # broadcast across multiple columns if necessary
+ if broadcast and key in self.columns and value.ndim == 1:
+ if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
+ existing_piece = self[key]
+ if isinstance(existing_piece, DataFrame):
+ value = np.tile(value, (len(existing_piece.columns), 1))
+
+ return np.atleast_2d(np.asarray(value))
+
+ @property
+ def _series(self):
+ return {
+ item: Series(self._data.iget(idx), index=self.index, name=item)
+ for idx, item in enumerate(self.columns)
+ }
+
+ def lookup(self, row_labels, col_labels):
+ """
+ Label-based "fancy indexing" function for DataFrame.
+
+ Given equal-length arrays of row and column labels, return an
+ array of the values corresponding to each (row, col) pair.
+
+ Parameters
+ ----------
+ row_labels : sequence
+ The row labels to use for lookup.
+ col_labels : sequence
+ The column labels to use for lookup.
+
+ Returns
+ -------
+ numpy.ndarray
+ The found values.
+
+ Notes
+ -----
+ Akin to::
+
+ result = [df.get_value(row, col)
+ for row, col in zip(row_labels, col_labels)]
+
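+ Examples
+ --------
+ A minimal sketch:
+
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]},
+ ... index=['x', 'y'])
+ >>> df.lookup(['x', 'y'], ['A', 'B'])
+ array([1, 4])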
+ """
+ n = len(row_labels)
+ if n != len(col_labels):
+ raise ValueError("Row labels must have same size as column labels")
+
+ thresh = 1000
+ if not self._is_mixed_type or n > thresh:
+ values = self.values
+ ridx = self.index.get_indexer(row_labels)
+ cidx = self.columns.get_indexer(col_labels)
+ if (ridx == -1).any():
+ raise KeyError("One or more row labels was not found")
+ if (cidx == -1).any():
+ raise KeyError("One or more column labels was not found")
+ flat_index = ridx * len(self.columns) + cidx
+ result = values.flat[flat_index]
+ else:
+ result = np.empty(n, dtype="O")
+ for i, (r, c) in enumerate(zip(row_labels, col_labels)):
+ result[i] = self._get_value(r, c)
+
+ if is_object_dtype(result):
+ result = lib.maybe_convert_objects(result)
+
+ return result
+
+ # ----------------------------------------------------------------------
+ # Reindexing and alignment
+
+ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
+ frame = self
+
+ columns = axes["columns"]
+ if columns is not None:
+ frame = frame._reindex_columns(
+ columns, method, copy, level, fill_value, limit, tolerance
+ )
+
+ index = axes["index"]
+ if index is not None:
+ frame = frame._reindex_index(
+ index, method, copy, level, fill_value, limit, tolerance
+ )
+
+ return frame
+
+ def _reindex_index(
+ self,
+ new_index,
+ method,
+ copy,
+ level,
+ fill_value=np.nan,
+ limit=None,
+ tolerance=None,
+ ):
+ new_index, indexer = self.index.reindex(
+ new_index, method=method, level=level, limit=limit, tolerance=tolerance
+ )
+ return self._reindex_with_indexers(
+ {0: [new_index, indexer]},
+ copy=copy,
+ fill_value=fill_value,
+ allow_dups=False,
+ )
+
+ def _reindex_columns(
+ self,
+ new_columns,
+ method,
+ copy,
+ level,
+ fill_value=None,
+ limit=None,
+ tolerance=None,
+ ):
+ new_columns, indexer = self.columns.reindex(
+ new_columns, method=method, level=level, limit=limit, tolerance=tolerance
+ )
+ return self._reindex_with_indexers(
+ {1: [new_columns, indexer]},
+ copy=copy,
+ fill_value=fill_value,
+ allow_dups=False,
+ )
+
+ def _reindex_multi(self, axes, copy, fill_value):
+ """
+ We are guaranteed non-Nones in the axes.
+ """
+
+ new_index, row_indexer = self.index.reindex(axes["index"])
+ new_columns, col_indexer = self.columns.reindex(axes["columns"])
+
+ if row_indexer is not None and col_indexer is not None:
+ indexer = row_indexer, col_indexer
+ new_values = algorithms.take_2d_multi(
+ self.values, indexer, fill_value=fill_value
+ )
+ return self._constructor(new_values, index=new_index, columns=new_columns)
+ else:
+ return self._reindex_with_indexers(
+ {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
+ copy=copy,
+ fill_value=fill_value,
+ )
+
+ @Appender(_shared_docs["align"] % _shared_doc_kwargs)
+ def align(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ broadcast_axis=None,
+ ):
+ return super().align(
+ other,
+ join=join,
+ axis=axis,
+ level=level,
+ copy=copy,
+ fill_value=fill_value,
+ method=method,
+ limit=limit,
+ fill_axis=fill_axis,
+ broadcast_axis=broadcast_axis,
+ )
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.reindex.__doc__)
+ @rewrite_axis_style_signature(
+ "labels",
+ [
+ ("method", None),
+ ("copy", True),
+ ("level", None),
+ ("fill_value", np.nan),
+ ("limit", None),
+ ("tolerance", None),
+ ],
+ )
+ def reindex(self, *args, **kwargs):
+ axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
+ kwargs.update(axes)
+ # Pop these, since the values are in `kwargs` under different names
+ kwargs.pop("axis", None)
+ kwargs.pop("labels", None)
+ return super().reindex(**kwargs)
+
+ def drop(
+ self,
+ labels=None,
+ axis=0,
+ index=None,
+ columns=None,
+ level=None,
+ inplace=False,
+ errors="raise",
+ ):
+ """
+ Drop specified labels from rows or columns.
+
+ Remove rows or columns by specifying label names and corresponding
+ axis, or by specifying directly index or column names. When using a
+ multi-index, labels on different levels can be removed by specifying
+ the level.
+
+ Parameters
+ ----------
+ labels : single label or list-like
+ Index or column labels to drop.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Whether to drop labels from the index (0 or 'index') or
+ columns (1 or 'columns').
+ index : single label or list-like
+ Alternative to specifying axis (``labels, axis=0``
+ is equivalent to ``index=labels``).
+
+ .. versionadded:: 0.21.0
+ columns : single label or list-like
+ Alternative to specifying axis (``labels, axis=1``
+ is equivalent to ``columns=labels``).
+
+ .. versionadded:: 0.21.0
+ level : int or level name, optional
+ For MultiIndex, level from which the labels will be removed.
+ inplace : bool, default False
+ If True, do operation inplace and return None.
+ errors : {'ignore', 'raise'}, default 'raise'
+ If 'ignore', suppress error and only existing labels are
+ dropped.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame without the removed index or column labels.
+
+ Raises
+ ------
+ KeyError
+ If any of the labels is not found in the selected axis.
+
+ See Also
+ --------
+ DataFrame.loc : Label-location based indexer for selection by label.
+ DataFrame.dropna : Return DataFrame with labels on given axis omitted
+ where (all or any) data are missing.
+ DataFrame.drop_duplicates : Return DataFrame with duplicate rows
+ removed, optionally only considering certain columns.
+ Series.drop : Return Series with specified index labels removed.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
+ ... columns=['A', 'B', 'C', 'D'])
+ >>> df
+ A B C D
+ 0 0 1 2 3
+ 1 4 5 6 7
+ 2 8 9 10 11
+
+ Drop columns
+
+ >>> df.drop(['B', 'C'], axis=1)
+ A D
+ 0 0 3
+ 1 4 7
+ 2 8 11
+
+ >>> df.drop(columns=['B', 'C'])
+ A D
+ 0 0 3
+ 1 4 7
+ 2 8 11
+
+ Drop a row by index
+
+ >>> df.drop([0, 1])
+ A B C D
+ 2 8 9 10 11
+
+ Drop columns and/or rows of MultiIndex DataFrame
+
+ >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
+ ... ['speed', 'weight', 'length']],
+ ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+ ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+ >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
+ ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
+ ... [250, 150], [1.5, 0.8], [320, 250],
+ ... [1, 0.8], [0.3, 0.2]])
+ >>> df
+ big small
+ lama speed 45.0 30.0
+ weight 200.0 100.0
+ length 1.5 1.0
+ cow speed 30.0 20.0
+ weight 250.0 150.0
+ length 1.5 0.8
+ falcon speed 320.0 250.0
+ weight 1.0 0.8
+ length 0.3 0.2
+
+ >>> df.drop(index='cow', columns='small')
+ big
+ lama speed 45.0
+ weight 200.0
+ length 1.5
+ falcon speed 320.0
+ weight 1.0
+ length 0.3
+
+ >>> df.drop(index='length', level=1)
+ big small
+ lama speed 45.0 30.0
+ weight 200.0 100.0
+ cow speed 30.0 20.0
+ weight 250.0 150.0
+ falcon speed 320.0 250.0
+ weight 1.0 0.8
+ """
+ return super().drop(
+ labels=labels,
+ axis=axis,
+ index=index,
+ columns=columns,
+ level=level,
+ inplace=inplace,
+ errors=errors,
+ )
+
+ @rewrite_axis_style_signature(
+ "mapper",
+ [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")],
+ )
+ def rename(self, *args, **kwargs):
+ """
+ Alter axes labels.
+
+ Function / dict values must be unique (1-to-1). Labels not contained in
+ a dict / Series will be left as-is. Extra labels listed don't throw an
+ error.
+
+ See the :ref:`user guide <basics.rename>` for more.
+
+ Parameters
+ ----------
+ mapper : dict-like or function
+ Dict-like or function transformations to apply to
+ that axis' values. Use either ``mapper`` and ``axis`` to
+ specify the axis to target with ``mapper``, or ``index`` and
+ ``columns``.
+ index : dict-like or function
+ Alternative to specifying axis (``mapper, axis=0``
+ is equivalent to ``index=mapper``).
+ columns : dict-like or function
+ Alternative to specifying axis (``mapper, axis=1``
+ is equivalent to ``columns=mapper``).
+ axis : int or str
+ Axis to target with ``mapper``. Can be either the axis name
+ ('index', 'columns') or number (0, 1). The default is 'index'.
+ copy : bool, default True
+ Also copy underlying data.
+ inplace : bool, default False
+ Whether to return a new DataFrame. If True then value of copy is
+ ignored.
+ level : int or level name, default None
+ In case of a MultiIndex, only rename labels in the specified
+ level.
+ errors : {'ignore', 'raise'}, default 'ignore'
+ If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
+ or `columns` contains labels that are not present in the Index
+ being transformed.
+ If 'ignore', existing keys will be renamed and extra keys will be
+ ignored.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame with the renamed axis labels.
+
+ Raises
+ ------
+ KeyError
+ If any of the labels is not found in the selected axis and
+ "errors='raise'".
+
+ See Also
+ --------
+ DataFrame.rename_axis : Set the name of the axis.
+
+ Examples
+ --------
+
+ ``DataFrame.rename`` supports two calling conventions
+
+ * ``(index=index_mapper, columns=columns_mapper, ...)``
+ * ``(mapper, axis={'index', 'columns'}, ...)``
+
+ We *highly* recommend using keyword arguments to clarify your
+ intent.
+
+ Rename columns using a mapping:
+
+ >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ >>> df.rename(columns={"A": "a", "B": "c"})
+ a c
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ Rename index using a mapping:
+
+ >>> df.rename(index={0: "x", 1: "y", 2: "z"})
+ A B
+ x 1 4
+ y 2 5
+ z 3 6
+
+ Cast index labels to a different type:
+
+ >>> df.index
+ RangeIndex(start=0, stop=3, step=1)
+ >>> df.rename(index=str).index
+ Index(['0', '1', '2'], dtype='object')
+
+ >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
+ Traceback (most recent call last):
+ ...
+ KeyError: ['C'] not found in axis
+
+ Using axis-style parameters
+
+ >>> df.rename(str.lower, axis='columns')
+ a b
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ >>> df.rename({1: 2, 2: 4}, axis='index')
+ A B
+ 0 1 4
+ 2 2 5
+ 4 3 6
+ """
+ axes = validate_axis_style_args(self, args, kwargs, "mapper", "rename")
+ kwargs.update(axes)
+ # Pop these, since the values are in `kwargs` under different names
+ kwargs.pop("axis", None)
+ kwargs.pop("mapper", None)
+ return super().rename(**kwargs)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.fillna.__doc__)
+ def fillna(
+ self,
+ value=None,
+ method=None,
+ axis=None,
+ inplace=False,
+ limit=None,
+ downcast=None,
+ **kwargs
+ ):
+ return super().fillna(
+ value=value,
+ method=method,
+ axis=axis,
+ inplace=inplace,
+ limit=limit,
+ downcast=downcast,
+ **kwargs
+ )
+
+ @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
+ def replace(
+ self,
+ to_replace=None,
+ value=None,
+ inplace=False,
+ limit=None,
+ regex=False,
+ method="pad",
+ ):
+ return super().replace(
+ to_replace=to_replace,
+ value=value,
+ inplace=inplace,
+ limit=limit,
+ regex=regex,
+ method=method,
+ )
+
+ @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
+ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ return super().shift(
+ periods=periods, freq=freq, axis=axis, fill_value=fill_value
+ )
+
+ def set_index(
+ self, keys, drop=True, append=False, inplace=False, verify_integrity=False
+ ):
+ """
+ Set the DataFrame index using existing columns.
+
+ Set the DataFrame index (row labels) using one or more existing
+ columns or arrays (of the correct length). The index can replace the
+ existing index or expand on it.
+
+ Parameters
+ ----------
+ keys : label or array-like or list of labels/arrays
+ This parameter can be either a single column key, a single array of
+ the same length as the calling DataFrame, or a list containing an
+ arbitrary combination of column keys and arrays. Here, "array"
+ encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
+ instances of :class:`~collections.abc.Iterator`.
+ drop : bool, default True
+ Delete columns to be used as the new index.
+ append : bool, default False
+ Whether to append columns to existing index.
+ inplace : bool, default False
+ Modify the DataFrame in place (do not create a new object).
+ verify_integrity : bool, default False
+ Check the new index for duplicates. Otherwise defer the check until
+ necessary. Setting to False will improve the performance of this
+ method.
+
+ Returns
+ -------
+ DataFrame
+ Changed row labels.
+
+ See Also
+ --------
+ DataFrame.reset_index : Opposite of set_index.
+ DataFrame.reindex : Change to new indices or expand indices.
+ DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
+ ... 'year': [2012, 2014, 2013, 2014],
+ ... 'sale': [55, 40, 84, 31]})
+ >>> df
+ month year sale
+ 0 1 2012 55
+ 1 4 2014 40
+ 2 7 2013 84
+ 3 10 2014 31
+
+ Set the index to become the 'month' column:
+
+ >>> df.set_index('month')
+ year sale
+ month
+ 1 2012 55
+ 4 2014 40
+ 7 2013 84
+ 10 2014 31
+
+ Create a MultiIndex using columns 'year' and 'month':
+
+ >>> df.set_index(['year', 'month'])
+ sale
+ year month
+ 2012 1 55
+ 2014 4 40
+ 2013 7 84
+ 2014 10 31
+
+ Create a MultiIndex using an Index and a column:
+
+ >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
+ month sale
+ year
+ 1 2012 1 55
+ 2 2014 4 40
+ 3 2013 7 84
+ 4 2014 10 31
+
+ Create a MultiIndex using two Series:
+
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> df.set_index([s, s**2])
+ month year sale
+ 1 1 1 2012 55
+ 2 4 4 2014 40
+ 3 9 7 2013 84
+ 4 16 10 2014 31
+ """
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ if not isinstance(keys, list):
+ keys = [keys]
+
+ err_msg = (
+ 'The parameter "keys" may be a column key, one-dimensional '
+ "array, or a list containing only valid column keys and "
+ "one-dimensional arrays."
+ )
+
+ missing = []
+ for col in keys:
+ if isinstance(
+ col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator)
+ ):
+ # arrays are fine as long as they are one-dimensional
+ # iterators get converted to list below
+ if getattr(col, "ndim", 1) != 1:
+ raise ValueError(err_msg)
+ else:
+ # everything else gets tried as a key; see GH 24969
+ try:
+ found = col in self.columns
+ except TypeError:
+ raise TypeError(
+ err_msg + " Received column of " "type {}".format(type(col))
+ )
+ else:
+ if not found:
+ missing.append(col)
+
+ if missing:
+ raise KeyError("None of {} are in the columns".format(missing))
+
+ if inplace:
+ frame = self
+ else:
+ frame = self.copy()
+
+ arrays = []
+ names = []
+ if append:
+ names = list(self.index.names)
+ if isinstance(self.index, ABCMultiIndex):
+ for i in range(self.index.nlevels):
+ arrays.append(self.index._get_level_values(i))
+ else:
+ arrays.append(self.index)
+
+ to_remove = []
+ for col in keys:
+ if isinstance(col, ABCMultiIndex):
+ for n in range(col.nlevels):
+ arrays.append(col._get_level_values(n))
+ names.extend(col.names)
+ elif isinstance(col, (ABCIndexClass, ABCSeries)):
+ # if Index then not MultiIndex (treated above)
+ arrays.append(col)
+ names.append(col.name)
+ elif isinstance(col, (list, np.ndarray)):
+ arrays.append(col)
+ names.append(None)
+ elif isinstance(col, abc.Iterator):
+ arrays.append(list(col))
+ names.append(None)
+ # from here, col can only be a column label
+ else:
+ arrays.append(frame[col]._values)
+ names.append(col)
+ if drop:
+ to_remove.append(col)
+
+ if len(arrays[-1]) != len(self):
+ # check newest element against length of calling frame, since
+ # ensure_index_from_sequences would not raise for append=False.
+ raise ValueError(
+ "Length mismatch: Expected {len_self} rows, "
+ "received array of length {len_col}".format(
+ len_self=len(self), len_col=len(arrays[-1])
+ )
+ )
+
+ index = ensure_index_from_sequences(arrays, names)
+
+ if verify_integrity and not index.is_unique:
+ duplicates = index[index.duplicated()].unique()
+ raise ValueError("Index has duplicate keys: {dup}".format(dup=duplicates))
+
+ # use set to handle duplicate column names gracefully in case of drop
+ for c in set(to_remove):
+ del frame[c]
+
+ # clear up memory usage
+ index._cleanup()
+
+ frame.index = index
+
+ if not inplace:
+ return frame
+
+ def reset_index(
+ self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
+ ):
+ """
+ Reset the index, or a level of it.
+
+ Reset the index of the DataFrame, and use the default one instead.
+ If the DataFrame has a MultiIndex, this method can remove one or more
+ levels.
+
+ Parameters
+ ----------
+ level : int, str, tuple, or list, default None
+ Only remove the given levels from the index. Removes all levels by
+ default.
+ drop : bool, default False
+ Do not try to insert index into dataframe columns. This resets
+ the index to the default integer index.
+ inplace : bool, default False
+ Modify the DataFrame in place (do not create a new object).
+ col_level : int or str, default 0
+ If the columns have multiple levels, determines which level the
+ labels are inserted into. By default it is inserted into the first
+ level.
+ col_fill : object, default ''
+ If the columns have multiple levels, determines how the other
+ levels are named. If None then the index name is repeated.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame with the new index.
+
+ See Also
+ --------
+ DataFrame.set_index : Opposite of reset_index.
+ DataFrame.reindex : Change to new indices or expand indices.
+ DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([('bird', 389.0),
+ ... ('bird', 24.0),
+ ... ('mammal', 80.5),
+ ... ('mammal', np.nan)],
+ ... index=['falcon', 'parrot', 'lion', 'monkey'],
+ ... columns=('class', 'max_speed'))
+ >>> df
+ class max_speed
+ falcon bird 389.0
+ parrot bird 24.0
+ lion mammal 80.5
+ monkey mammal NaN
+
+ When we reset the index, the old index is added as a column, and a
+ new sequential index is used:
+
+ >>> df.reset_index()
+ index class max_speed
+ 0 falcon bird 389.0
+ 1 parrot bird 24.0
+ 2 lion mammal 80.5
+ 3 monkey mammal NaN
+
+ We can use the `drop` parameter to avoid the old index being added as
+ a column:
+
+ >>> df.reset_index(drop=True)
+ class max_speed
+ 0 bird 389.0
+ 1 bird 24.0
+ 2 mammal 80.5
+ 3 mammal NaN
+
+ You can also use `reset_index` with `MultiIndex`.
+
+ >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
+ ... ('bird', 'parrot'),
+ ... ('mammal', 'lion'),
+ ... ('mammal', 'monkey')],
+ ... names=['class', 'name'])
+ >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
+ ... ('species', 'type')])
+ >>> df = pd.DataFrame([(389.0, 'fly'),
+ ... ( 24.0, 'fly'),
+ ... ( 80.5, 'run'),
+ ... (np.nan, 'jump')],
+ ... index=index,
+ ... columns=columns)
+ >>> df
+ speed species
+ max type
+ class name
+ bird falcon 389.0 fly
+ parrot 24.0 fly
+ mammal lion 80.5 run
+ monkey NaN jump
+
+ If the index has multiple levels, we can reset a subset of them:
+
+ >>> df.reset_index(level='class')
+ class speed species
+ max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+
+ If we are not dropping the index, by default, it is placed in the top
+ level. We can place it in another level:
+
+ >>> df.reset_index(level='class', col_level=1)
+ speed species
+ class max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+
+ When the index is inserted under another level, we can specify under
+ which one with the parameter `col_fill`:
+
+ >>> df.reset_index(level='class', col_level=1, col_fill='species')
+ species speed species
+ class max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+
+ If we specify a nonexistent level for `col_fill`, it is created:
+
+ >>> df.reset_index(level='class', col_level=1, col_fill='genus')
+ genus speed species
+ class max type
+ name
+ falcon bird 389.0 fly
+ parrot bird 24.0 fly
+ lion mammal 80.5 run
+ monkey mammal NaN jump
+ """
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ if inplace:
+ new_obj = self
+ else:
+ new_obj = self.copy()
+
+ def _maybe_casted_values(index, labels=None):
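+ # `labels`, when given, are the integer codes for one index level;
+ # a code of -1 marks a missing entry that must surface as NaN
+ # in the materialized column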
+ values = index._values
+ if not isinstance(index, (PeriodIndex, DatetimeIndex)):
+ if values.dtype == np.object_:
+ values = lib.maybe_convert_objects(values)
+
+ # if we have the labels, extract the values with a mask
+ if labels is not None:
+ mask = labels == -1
+
+ # we can have situations where the whole mask is -1,
+ # meaning nothing was found in labels, so make everything NaN
+ if mask.all():
+ values = np.empty(len(mask))
+ values.fill(np.nan)
+ else:
+ values = values.take(labels)
+
+ # TODO(https://github.com/pandas-dev/pandas/issues/24206)
+ # Push this into maybe_upcast_putmask?
+ # We can't pass EAs there right now. Looks a bit
+ # complicated.
+ # So we unbox the ndarray_values, op, re-box.
+ values_type = type(values)
+ values_dtype = values.dtype
+
+ if issubclass(values_type, DatetimeLikeArray):
+ values = values._data
+
+ if mask.any():
+ values, changed = maybe_upcast_putmask(values, mask, np.nan)
+
+ if issubclass(values_type, DatetimeLikeArray):
+ values = values_type(values, dtype=values_dtype)
+
+ return values
+
+ new_index = ibase.default_index(len(new_obj))
+ if level is not None:
+ if not isinstance(level, (tuple, list)):
+ level = [level]
+ level = [self.index._get_level_number(lev) for lev in level]
+ if len(level) < self.index.nlevels:
+ new_index = self.index.droplevel(level)
+
+ if not drop:
+ if isinstance(self.index, MultiIndex):
+ names = [
+ n if n is not None else ("level_%d" % i)
+ for (i, n) in enumerate(self.index.names)
+ ]
+ to_insert = zip(self.index.levels, self.index.codes)
+ else:
+ default = "index" if "index" not in self else "level_0"
+ names = [default] if self.index.name is None else [self.index.name]
+ to_insert = ((self.index, None),)
+
+ multi_col = isinstance(self.columns, MultiIndex)
+ for i, (lev, lab) in reversed(list(enumerate(to_insert))):
+ if not (level is None or i in level):
+ continue
+ name = names[i]
+ if multi_col:
+ col_name = list(name) if isinstance(name, tuple) else [name]
+ if col_fill is None:
+ if len(col_name) not in (1, self.columns.nlevels):
+ raise ValueError(
+ "col_fill=None is incompatible "
+ "with incomplete column name "
+ "{}".format(name)
+ )
+ col_fill = col_name[0]
+
+ lev_num = self.columns._get_level_number(col_level)
+ name_lst = [col_fill] * lev_num + col_name
+ missing = self.columns.nlevels - len(name_lst)
+ name_lst += [col_fill] * missing
+ name = tuple(name_lst)
+ # to ndarray and maybe infer different dtype
+ level_values = _maybe_casted_values(lev, lab)
+ new_obj.insert(0, name, level_values)
+
+ new_obj.index = new_index
+ if not inplace:
+ return new_obj
+
+ # ----------------------------------------------------------------------
+ # Reindex-based selection methods
+
+ @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
+ def isna(self):
+ return super().isna()
+
+ @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
+ def isnull(self):
+ return super().isnull()
+
+ @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
+ def notna(self):
+ return super().notna()
+
+ @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
+ def notnull(self):
+ return super().notnull()
+
+ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
+ """
+ Remove missing values.
+
+ See the :ref:`User Guide <missing_data>` for more on which values are
+ considered missing, and how to work with missing data.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Determine if rows or columns which contain missing values are
+ removed.
+
+ * 0, or 'index' : Drop rows which contain missing values.
+ * 1, or 'columns' : Drop columns which contain missing values.
+
+ .. deprecated:: 0.23.0
+
+ Passing a tuple or list to drop on multiple axes is
+ deprecated; only a single axis is allowed.
+
+ how : {'any', 'all'}, default 'any'
+ Determine whether a row or column is removed from the DataFrame
+ when it has at least one NA or all NA values.
+
+ * 'any' : If any NA values are present, drop that row or column.
+ * 'all' : If all values are NA, drop that row or column.
+
+ thresh : int, optional
+ Require that many non-NA values.
+ subset : array-like, optional
+ Labels along other axis to consider, e.g. if you are dropping rows
+ these would be a list of columns to include.
+ inplace : bool, default False
+ If True, do operation inplace and return None.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame with NA entries dropped from it.
+
+ See Also
+ --------
+ DataFrame.isna: Indicate missing values.
+ DataFrame.notna : Indicate existing (non-missing) values.
+ DataFrame.fillna : Replace missing values.
+ Series.dropna : Drop missing values.
+ Index.dropna : Drop missing indices.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
+ ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
+ ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
+ ... pd.NaT]})
+ >>> df
+ name toy born
+ 0 Alfred NaN NaT
+ 1 Batman Batmobile 1940-04-25
+ 2 Catwoman Bullwhip NaT
+
+ Drop the rows where at least one element is missing.
+
+ >>> df.dropna()
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+
+ Drop the columns where at least one element is missing.
+
+ >>> df.dropna(axis='columns')
+ name
+ 0 Alfred
+ 1 Batman
+ 2 Catwoman
+
+ Drop the rows where all elements are missing.
+
+ >>> df.dropna(how='all')
+ name toy born
+ 0 Alfred NaN NaT
+ 1 Batman Batmobile 1940-04-25
+ 2 Catwoman Bullwhip NaT
+
+ Keep only the rows with at least 2 non-NA values.
+
+ >>> df.dropna(thresh=2)
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+ 2 Catwoman Bullwhip NaT
+
+ Define in which columns to look for missing values.
+
+ >>> df.dropna(subset=['name', 'born'])
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+
+ Keep the DataFrame with valid entries in the same variable.
+
+ >>> df.dropna(inplace=True)
+ >>> df
+ name toy born
+ 1 Batman Batmobile 1940-04-25
+ """
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ if isinstance(axis, (tuple, list)):
+ # GH20987
+ msg = (
+ "supplying multiple axes to axis is deprecated and "
+ "will be removed in a future version."
+ )
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+
+ result = self
+ for ax in axis:
+ result = result.dropna(how=how, thresh=thresh, subset=subset, axis=ax)
+ else:
+ axis = self._get_axis_number(axis)
+ agg_axis = 1 - axis
+
+ agg_obj = self
+ if subset is not None:
+ ax = self._get_axis(agg_axis)
+ indices = ax.get_indexer_for(subset)
+ check = indices == -1
+ if check.any():
+ raise KeyError(list(np.compress(check, subset)))
+ agg_obj = self.take(indices, axis=agg_axis)
+
+ count = agg_obj.count(axis=agg_axis)
+
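+ # translate `thresh`/`how` into a boolean mask over the retained
+ # axis: True means the row/column has enough non-NA values to keep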
+ if thresh is not None:
+ mask = count >= thresh
+ elif how == "any":
+ mask = count == len(agg_obj._get_axis(agg_axis))
+ elif how == "all":
+ mask = count > 0
+ else:
+ if how is not None:
+ raise ValueError("invalid how option: {h}".format(h=how))
+ else:
+ raise TypeError("must specify how or thresh")
+
+ result = self.loc(axis=axis)[mask]
+
+ if inplace:
+ self._update_inplace(result)
+ else:
+ return result
+
+ def drop_duplicates(self, subset=None, keep="first", inplace=False):
+ """
+ Return DataFrame with duplicate rows removed, optionally only
+ considering certain columns. Indexes, including time indexes,
+ are ignored.
+
+ Parameters
+ ----------
+ subset : column label or sequence of labels, optional
+ Only consider certain columns for identifying duplicates; by
+ default, use all of the columns.
+ keep : {'first', 'last', False}, default 'first'
+ - ``first`` : Drop duplicates except for the first occurrence.
+ - ``last`` : Drop duplicates except for the last occurrence.
+ - False : Drop all duplicates.
+ inplace : bool, default False
+ Whether to drop duplicates in place or to return a copy.
+
+ Returns
+ -------
+ DataFrame
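+
+ Examples
+ --------
+ A minimal sketch with one repeated row (illustrative data):
+
+ >>> df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
+ >>> df.drop_duplicates()
+ a b
+ 0 1 x
+ 2 2 y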
+ """
+ if self.empty:
+ return self.copy()
+
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ duplicated = self.duplicated(subset, keep=keep)
+
+ if inplace:
+ inds, = (~duplicated)._ndarray_values.nonzero()
+ new_data = self._data.take(inds)
+ self._update_inplace(new_data)
+ else:
+ return self[~duplicated]
+
+ def duplicated(self, subset=None, keep="first"):
+ """
+ Return boolean Series denoting duplicate rows, optionally only
+ considering certain columns.
+
+ Parameters
+ ----------
+ subset : column label or sequence of labels, optional
+ Only consider certain columns for identifying duplicates; by
+ default, use all of the columns.
+ keep : {'first', 'last', False}, default 'first'
+ - ``first`` : Mark duplicates as ``True`` except for the
+ first occurrence.
+ - ``last`` : Mark duplicates as ``True`` except for the
+ last occurrence.
+ - False : Mark all duplicates as ``True``.
+
+ Returns
+ -------
+ Series
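+
+ Examples
+ --------
+ A minimal sketch (illustrative data); by default only occurrences
+ after the first are marked:
+
+ >>> df = pd.DataFrame({'a': [1, 1, 2]})
+ >>> df.duplicated()
+ 0 False
+ 1 True
+ 2 False
+ dtype: bool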
+ """
+ from pandas.core.sorting import get_group_index
+ from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
+
+ if self.empty:
+ return Series(dtype=bool)
+
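+ # factorize each selected column to integer codes; get_group_index
+ # below combines the per-column codes into a single id per row, so
+ # duplicated rows collapse to duplicated ids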
+ def f(vals):
+ labels, shape = algorithms.factorize(
+ vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)
+ )
+ return labels.astype("i8", copy=False), len(shape)
+
+ if subset is None:
+ subset = self.columns
+ elif (
+ not np.iterable(subset)
+ or isinstance(subset, str)
+ or isinstance(subset, tuple)
+ and subset in self.columns
+ ):
+ subset = (subset,)
+
+ # Verify all columns in subset exist in the queried dataframe
+ # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
+ # key that doesn't exist.
+ diff = Index(subset).difference(self.columns)
+ if not diff.empty:
+ raise KeyError(diff)
+
+ vals = (col.values for name, col in self.items() if name in subset)
+ labels, shape = map(list, zip(*map(f, vals)))
+
+ ids = get_group_index(labels, shape, sort=False, xnull=False)
+ return Series(duplicated_int64(ids, keep), index=self.index)
+
+ # ----------------------------------------------------------------------
+ # Sorting
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.sort_values.__doc__)
+ def sort_values(
+ self,
+ by,
+ axis=0,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ ):
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ axis = self._get_axis_number(axis)
+
+ if not isinstance(by, list):
+ by = [by]
+ if is_sequence(ascending) and len(by) != len(ascending):
+ raise ValueError(
+ "Length of ascending (%d) != length of by (%d)"
+ % (len(ascending), len(by))
+ )
+ if len(by) > 1:
+ from pandas.core.sorting import lexsort_indexer
+
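+ # multiple sort keys: build one indexer that sorts lexicographically,
+ # i.e. later keys in `by` break ties left by earlier ones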
+ keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
+ indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
+ indexer = ensure_platform_int(indexer)
+ else:
+ from pandas.core.sorting import nargsort
+
+ by = by[0]
+ k = self._get_label_or_level_values(by, axis=axis)
+
+ if isinstance(ascending, (tuple, list)):
+ ascending = ascending[0]
+
+ indexer = nargsort(
+ k, kind=kind, ascending=ascending, na_position=na_position
+ )
+
+ new_data = self._data.take(
+ indexer, axis=self._get_block_manager_axis(axis), verify=False
+ )
+
+ if inplace:
+ return self._update_inplace(new_data)
+ else:
+ return self._constructor(new_data).__finalize__(self)
+
+ @Substitution(**_shared_doc_kwargs)
+ @Appender(NDFrame.sort_index.__doc__)
+ def sort_index(
+ self,
+ axis=0,
+ level=None,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ sort_remaining=True,
+ by=None,
+ ):
+
+ # TODO: this can be combined with Series.sort_index impl as
+ # almost identical
+
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ # GH#10726
+ if by is not None:
+ warnings.warn(
+ "by argument to sort_index is deprecated, "
+ "please use .sort_values(by=...)",
+ FutureWarning,
+ stacklevel=2,
+ )
+ if level is not None:
+ raise ValueError("unable to simultaneously sort by and level")
+ return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace)
+
+ axis = self._get_axis_number(axis)
+ labels = self._get_axis(axis)
+
+ # make sure that the axis is lexsorted to start
+ # if not we need to reconstruct to get the correct indexer
+ labels = labels._sort_levels_monotonic()
+ if level is not None:
+
+ new_axis, indexer = labels.sortlevel(
+ level, ascending=ascending, sort_remaining=sort_remaining
+ )
+
+ elif isinstance(labels, MultiIndex):
+ from pandas.core.sorting import lexsort_indexer
+
+ indexer = lexsort_indexer(
+ labels._get_codes_for_sorting(),
+ orders=ascending,
+ na_position=na_position,
+ )
+ else:
+ from pandas.core.sorting import nargsort
+
+ # Check monotonicity before sorting the index
+ # GH11080
+ if (ascending and labels.is_monotonic_increasing) or (
+ not ascending and labels.is_monotonic_decreasing
+ ):
+ if inplace:
+ return
+ else:
+ return self.copy()
+
+ indexer = nargsort(
+ labels, kind=kind, ascending=ascending, na_position=na_position
+ )
+
+ baxis = self._get_block_manager_axis(axis)
+ new_data = self._data.take(indexer, axis=baxis, verify=False)
+
+ # reconstruct axis if needed
+ new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
+
+ if inplace:
+ return self._update_inplace(new_data)
+ else:
+ return self._constructor(new_data).__finalize__(self)
+
+ def nlargest(self, n, columns, keep="first"):
+ """
+ Return the first `n` rows ordered by `columns` in descending order.
+
+ Return the first `n` rows with the largest values in `columns`, in
+ descending order. The columns that are not specified are returned as
+ well, but not used for ordering.
+
+ This method is equivalent to
+ ``df.sort_values(columns, ascending=False).head(n)``, but more
+ performant.
+
+ Parameters
+ ----------
+ n : int
+ Number of rows to return.
+ columns : label or list of labels
+ Column label(s) to order by.
+ keep : {'first', 'last', 'all'}, default 'first'
+ Where there are duplicate values:
+
+ - ``first`` : prioritize the first occurrence(s)
+ - ``last`` : prioritize the last occurrence(s)
+ - ``all`` : do not drop any duplicates, even if it means
+ selecting more than `n` items.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ DataFrame
+ The first `n` rows ordered by the given columns in descending
+ order.
+
+ See Also
+ --------
+ DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
+ ascending order.
+ DataFrame.sort_values : Sort DataFrame by the values.
+ DataFrame.head : Return the first `n` rows without re-ordering.
+
+ Notes
+ -----
+ This function cannot be used with all column types. For example, when
+ specifying columns with `object` or `category` dtypes, ``TypeError`` is
+ raised.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
+ ... 434000, 434000, 337000, 11300,
+ ... 11300, 11300],
+ ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
+ ... 17036, 182, 38, 311],
+ ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+ ... "IS", "NR", "TV", "AI"]},
+ ... index=["Italy", "France", "Malta",
+ ... "Maldives", "Brunei", "Iceland",
+ ... "Nauru", "Tuvalu", "Anguilla"])
+ >>> df
+ population GDP alpha-2
+ Italy 59000000 1937894 IT
+ France 65000000 2583560 FR
+ Malta 434000 12011 MT
+ Maldives 434000 4520 MV
+ Brunei 434000 12128 BN
+ Iceland 337000 17036 IS
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ In the following example, we will use ``nlargest`` to select the three
+ rows having the largest values in column "population".
+
+ >>> df.nlargest(3, 'population')
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Malta 434000 12011 MT
+
+ When using ``keep='last'``, ties are resolved in reverse order:
+
+ >>> df.nlargest(3, 'population', keep='last')
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Brunei 434000 12128 BN
+
+ When using ``keep='all'``, all duplicate items are maintained:
+
+ >>> df.nlargest(3, 'population', keep='all')
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Malta 434000 12011 MT
+ Maldives 434000 4520 MV
+ Brunei 434000 12128 BN
+
+ To order by the largest values in column "population" and then "GDP",
+ we can specify multiple columns like in the next example.
+
+ >>> df.nlargest(3, ['population', 'GDP'])
+ population GDP alpha-2
+ France 65000000 2583560 FR
+ Italy 59000000 1937894 IT
+ Brunei 434000 12128 BN
+ """
+ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
+
+ def nsmallest(self, n, columns, keep="first"):
+ """
+ Return the first `n` rows ordered by `columns` in ascending order.
+
+ Return the first `n` rows with the smallest values in `columns`, in
+ ascending order. The columns that are not specified are returned as
+ well, but not used for ordering.
+
+ This method is equivalent to
+ ``df.sort_values(columns, ascending=True).head(n)``, but more
+ performant.
+
+ Parameters
+ ----------
+ n : int
+ Number of items to retrieve.
+ columns : list or str
+ Column name or names to order by.
+ keep : {'first', 'last', 'all'}, default 'first'
+ Where there are duplicate values:
+
+ - ``first`` : take the first occurrence.
+ - ``last`` : take the last occurrence.
+ - ``all`` : do not drop any duplicates, even if it means
+ selecting more than `n` items.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
+ descending order.
+ DataFrame.sort_values : Sort DataFrame by the values.
+ DataFrame.head : Return the first `n` rows without re-ordering.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
+ ... 434000, 434000, 337000, 11300,
+ ... 11300, 11300],
+ ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
+ ... 17036, 182, 38, 311],
+ ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+ ... "IS", "NR", "TV", "AI"]},
+ ... index=["Italy", "France", "Malta",
+ ... "Maldives", "Brunei", "Iceland",
+ ... "Nauru", "Tuvalu", "Anguilla"])
+ >>> df
+ population GDP alpha-2
+ Italy 59000000 1937894 IT
+ France 65000000 2583560 FR
+ Malta 434000 12011 MT
+ Maldives 434000 4520 MV
+ Brunei 434000 12128 BN
+ Iceland 337000 17036 IS
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ In the following example, we will use ``nsmallest`` to select the
+ three rows having the smallest values in column "population".
+
+ >>> df.nsmallest(3, 'population')
+ population GDP alpha-2
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ When using ``keep='last'``, ties are resolved in reverse order:
+
+ >>> df.nsmallest(3, 'population', keep='last')
+ population GDP alpha-2
+ Anguilla 11300 311 AI
+ Tuvalu 11300 38 TV
+ Nauru 11300 182 NR
+
+ When using ``keep='all'``, all duplicate items are maintained:
+
+ >>> df.nsmallest(3, 'population', keep='all')
+ population GDP alpha-2
+ Nauru 11300 182 NR
+ Tuvalu 11300 38 TV
+ Anguilla 11300 311 AI
+
+ To order by the smallest values in column "population" and then
+ "GDP", we can specify multiple columns like in the next example.
+
+ >>> df.nsmallest(3, ['population', 'GDP'])
+ population GDP alpha-2
+ Tuvalu 11300 38 TV
+ Nauru 11300 182 NR
+ Anguilla 11300 311 AI
+ """
+ return algorithms.SelectNFrame(
+ self, n=n, keep=keep, columns=columns
+ ).nsmallest()
+
+ def swaplevel(self, i=-2, j=-1, axis=0):
+ """
+ Swap levels i and j in a MultiIndex on a particular axis.
+
+ Parameters
+ ----------
+ i, j : int or str
+ Levels of the index to be swapped. Can pass level name as string.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to swap levels on.
+
+ Returns
+ -------
+ DataFrame
+
+ .. versionchanged:: 0.18.1
+
+ The indexes ``i`` and ``j`` are now optional, and default to
+ the two innermost levels of the index.
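+
+ Examples
+ --------
+ A minimal sketch on a hypothetical two-level index:
+
+ >>> midx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)])
+ >>> df = pd.DataFrame({'x': [10, 20]}, index=midx)
+ >>> df.swaplevel()
+ x
+ 1 a 10
+ 2 b 20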
+ """
+ result = self.copy()
+
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ result.index = result.index.swaplevel(i, j)
+ else:
+ result.columns = result.columns.swaplevel(i, j)
+ return result
+
+ def reorder_levels(self, order, axis=0):
+ """
+ Rearrange index levels using input order. May not drop or
+ duplicate levels.
+
+ Parameters
+ ----------
+ order : list of int or list of str
+ List representing new level order. Reference level by number
+ (position) or by key (label).
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis along which to reorder levels.
+
+ Returns
+ -------
+ type of caller (new object)
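+
+ Examples
+ --------
+ A minimal sketch on a hypothetical two-level index:
+
+ >>> midx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)])
+ >>> df = pd.DataFrame({'x': [10, 20]}, index=midx)
+ >>> df.reorder_levels([1, 0])
+ x
+ 1 a 10
+ 2 b 20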
+ """
+ axis = self._get_axis_number(axis)
+ if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
+ raise TypeError("Can only reorder levels on a hierarchical axis.")
+
+ result = self.copy()
+
+ if axis == 0:
+ result.index = result.index.reorder_levels(order)
+ else:
+ result.columns = result.columns.reorder_levels(order)
+ return result
+
+ # ----------------------------------------------------------------------
+ # Arithmetic / combination related
+
+ def _combine_frame(self, other, func, fill_value=None, level=None):
+ this, other = self.align(other, join="outer", level=level, copy=False)
+ new_index, new_columns = this.index, this.columns
+
+ def _arith_op(left, right):
+ # for the mixed_type case where we iterate over columns,
+ # _arith_op(left, right) is equivalent to
+ # left._binop(right, func, fill_value=fill_value)
+ left, right = ops.fill_binop(left, right, fill_value)
+ return func(left, right)
+
+ if ops.should_series_dispatch(this, other, func):
+ # iterate over columns
+ return ops.dispatch_to_series(this, other, _arith_op)
+ else:
+ result = _arith_op(this.values, other.values)
+ return self._constructor(
+ result, index=new_index, columns=new_columns, copy=False
+ )
+
+ def _combine_match_index(self, other, func, level=None):
+ left, right = self.align(other, join="outer", axis=0, level=level, copy=False)
+ assert left.index.equals(right.index)
+
+ if left._is_mixed_type or right._is_mixed_type:
+ # operate column-wise; avoid costly object-casting in `.values`
+ return ops.dispatch_to_series(left, right, func)
+ else:
+ # fastpath --> operate directly on values
+ with np.errstate(all="ignore"):
+ new_data = func(left.values.T, right.values).T
+ return self._constructor(
+ new_data, index=left.index, columns=self.columns, copy=False
+ )
+
+ def _combine_match_columns(self, other, func, level=None):
+ assert isinstance(other, Series)
+ left, right = self.align(other, join="outer", axis=1, level=level, copy=False)
+ assert left.columns.equals(right.index)
+ return ops.dispatch_to_series(left, right, func, axis="columns")
+
+ def _combine_const(self, other, func):
+ assert lib.is_scalar(other) or np.ndim(other) == 0
+ return ops.dispatch_to_series(self, other, func)
+
+ def combine(self, other, func, fill_value=None, overwrite=True):
+ """
+ Perform column-wise combine with another DataFrame.
+
+ Combines a DataFrame with `other` DataFrame using `func`
+ to element-wise combine columns. The row and column indexes of the
+ resulting DataFrame will be the union of the two.
+
+ Parameters
+ ----------
+ other : DataFrame
+ The DataFrame to merge column-wise.
+ func : function
+ Function that takes two Series as inputs and returns a Series or a
+ scalar. Used to merge the two dataframes column by column.
+ fill_value : scalar value, default None
+ The value to fill NaNs with prior to passing any column to the
+ merge func.
+ overwrite : bool, default True
+ If True, columns in `self` that do not exist in `other` will be
+ overwritten with NaNs.
+
+ Returns
+ -------
+ DataFrame
+ Combination of the provided DataFrames.
+
+ See Also
+ --------
+ DataFrame.combine_first : Combine two DataFrame objects and default to
+ non-null values in frame calling the method.
+
+ Examples
+ --------
+ Combine using a simple function that chooses the smaller column.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
+ >>> df1.combine(df2, take_smaller)
+ A B
+ 0 0 3
+ 1 0 3
+
+ Example using a true element-wise combine function.
+
+ >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine(df2, np.minimum)
+ A B
+ 0 1 2
+ 1 0 3
+
+ Using `fill_value` fills Nones prior to passing the column to the
+ merge function.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+ 0 0 -5.0
+ 1 0 4.0
+
+ However, if the same element in both dataframes is None, that None
+ is preserved:
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
+ >>> df1.combine(df2, take_smaller, fill_value=-5)
+ A B
+ 0 0 -5.0
+ 1 0 3.0
+
+ Example that demonstrates the use of `overwrite` and behavior when
+ the axes differ between the dataframes.
+
+ >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
+ >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
+ >>> df1.combine(df2, take_smaller)
+ A B C
+ 0 NaN NaN NaN
+ 1 NaN 3.0 -10.0
+ 2 NaN 3.0 1.0
+
+ >>> df1.combine(df2, take_smaller, overwrite=False)
+ A B C
+ 0 0.0 NaN NaN
+ 1 0.0 3.0 -10.0
+ 2 NaN 3.0 1.0
+
+ Demonstrating the precedence of the passed-in dataframe.
+
+ >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
+ >>> df2.combine(df1, take_smaller)
+ A B C
+ 0 0.0 NaN NaN
+ 1 0.0 3.0 NaN
+ 2 NaN 3.0 NaN
+
+ >>> df2.combine(df1, take_smaller, overwrite=False)
+ A B C
+ 0 0.0 NaN NaN
+ 1 0.0 3.0 1.0
+ 2 NaN 3.0 1.0
+ """
+ other_idxlen = len(other.index) # save for compare
+
+ this, other = self.align(other, copy=False)
+ new_index = this.index
+
+ if other.empty and len(new_index) == len(self.index):
+ return self.copy()
+
+ if self.empty and len(other) == other_idxlen:
+ return other.copy()
+
+ # sorts if possible
+ new_columns = this.columns.union(other.columns)
+ do_fill = fill_value is not None
+ result = {}
+ for col in new_columns:
+ series = this[col]
+ otherSeries = other[col]
+
+ this_dtype = series.dtype
+ other_dtype = otherSeries.dtype
+
+ this_mask = isna(series)
+ other_mask = isna(otherSeries)
+
+ # don't overwrite columns unnecessarily
+ # DO propagate if this column is not in the intersection
+ if not overwrite and other_mask.all():
+ result[col] = this[col].copy()
+ continue
+
+ if do_fill:
+ series = series.copy()
+ otherSeries = otherSeries.copy()
+ series[this_mask] = fill_value
+ otherSeries[other_mask] = fill_value
+
+ if col not in self.columns:
+ # If `col` is not in self, the aligned series is all-NaN;
+ # try to promote it to other's dtype.
+ new_dtype = other_dtype
+ try:
+ series = series.astype(new_dtype, copy=False)
+ except ValueError:
+ # e.g. new_dtype is integer types
+ pass
+ else:
+ # if we have different dtypes, possibly promote
+ new_dtype = find_common_type([this_dtype, other_dtype])
+ if not is_dtype_equal(this_dtype, new_dtype):
+ series = series.astype(new_dtype)
+ if not is_dtype_equal(other_dtype, new_dtype):
+ otherSeries = otherSeries.astype(new_dtype)
+
+ arr = func(series, otherSeries)
+ arr = maybe_downcast_to_dtype(arr, this_dtype)
+
+ result[col] = arr
+
+ # convert_objects just in case
+ return self._constructor(result, index=new_index, columns=new_columns)
+
+ def combine_first(self, other):
+ """
+ Update null elements with value in the same location in `other`.
+
+ Combine two DataFrame objects by filling null values in one DataFrame
+ with non-null values from other DataFrame. The row and column indexes
+ of the resulting DataFrame will be the union of the two.
+
+ Parameters
+ ----------
+ other : DataFrame
+ Provided DataFrame to use to fill null values.
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ DataFrame.combine : Perform series-wise operation on two DataFrames
+ using a given function.
+
+ Examples
+ --------
+
+ >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
+ >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
+ >>> df1.combine_first(df2)
+ A B
+ 0 1.0 3.0
+ 1 0.0 4.0
+
+ Null values still persist if the location of that null value
+ does not exist in `other`:
+
+ >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
+ >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
+ >>> df1.combine_first(df2)
+ A B C
+ 0 NaN 4.0 NaN
+ 1 0.0 3.0 1.0
+ 2 NaN 3.0 1.0
+ """
+ import pandas.core.computation.expressions as expressions
+
+ def extract_values(arr):
+ # Does two things:
+ # 1. maybe gets the values from the Series / Index
+ # 2. convert datelike to i8
+ if isinstance(arr, (ABCIndexClass, ABCSeries)):
+ arr = arr._values
+
+ if needs_i8_conversion(arr):
+ if is_extension_array_dtype(arr.dtype):
+ arr = arr.asi8
+ else:
+ arr = arr.view("i8")
+ return arr
+
+ def combiner(x, y):
+ mask = isna(x)
+ if isinstance(mask, (ABCIndexClass, ABCSeries)):
+ mask = mask._values
+
+ x_values = extract_values(x)
+ y_values = extract_values(y)
+
+ # If the column y in other DataFrame is not in first DataFrame,
+ # just return y_values.
+ if y.name not in self.columns:
+ return y_values
+
+ return expressions.where(mask, y_values, x_values)
+
+ return self.combine(other, combiner, overwrite=False)
+
+ @deprecate_kwarg(
+ old_arg_name="raise_conflict",
+ new_arg_name="errors",
+ mapping={False: "ignore", True: "raise"},
+ )
+ def update(
+ self, other, join="left", overwrite=True, filter_func=None, errors="ignore"
+ ):
+ """
+ Modify in place using non-NA values from another DataFrame.
+
+ Aligns on indices. There is no return value.
+
+ Parameters
+ ----------
+ other : DataFrame, or object coercible into a DataFrame
+ Should have at least one matching index/column label
+ with the original DataFrame. If a Series is passed,
+ its name attribute must be set, and that will be
+ used as the column name to align with the original DataFrame.
+ join : {'left'}, default 'left'
+ Only left join is implemented, keeping the index and columns of the
+ original object.
+ overwrite : bool, default True
+ How to handle non-NA values for overlapping keys:
+
+ * True: overwrite original DataFrame's values
+ with values from `other`.
+ * False: only update values that are NA in
+ the original DataFrame.
+
+ filter_func : callable(1d-array) -> bool 1d-array, optional
+ Can choose to replace values other than NA. Return True for values
+ that should be updated.
+ errors : {'raise', 'ignore'}, default 'ignore'
+ If 'raise', will raise a ValueError if the DataFrame and `other`
+ both contain non-NA data in the same place.
+
+ .. versionchanged:: 0.24.0
+ Changed from `raise_conflict=False|True`
+ to `errors='ignore'|'raise'`.
+
+ Returns
+ -------
+ None : method directly changes calling object
+
+ Raises
+ ------
+ ValueError
+ * When `errors='raise'` and there's overlapping non-NA data.
+ * When `errors` is not either `'ignore'` or `'raise'`
+ NotImplementedError
+ * If `join != 'left'`
+
+ See Also
+ --------
+ dict.update : Similar method for dictionaries.
+ DataFrame.merge : For column(s)-on-columns(s) operations.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2, 3],
+ ... 'B': [400, 500, 600]})
+ >>> new_df = pd.DataFrame({'B': [4, 5, 6],
+ ... 'C': [7, 8, 9]})
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 1 4
+ 1 2 5
+ 2 3 6
+
+ The DataFrame's length does not increase as a result of the update,
+ only values at matching index/column labels are updated.
+
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+ ... 'B': ['x', 'y', 'z']})
+ >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 a d
+ 1 b e
+ 2 c f
+
+ For Series, its name attribute must be set.
+
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+ ... 'B': ['x', 'y', 'z']})
+ >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
+ >>> df.update(new_column)
+ >>> df
+ A B
+ 0 a d
+ 1 b y
+ 2 c e
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
+ ... 'B': ['x', 'y', 'z']})
+ >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 a x
+ 1 b d
+ 2 c e
+
+ If `other` contains NaNs the corresponding values are not updated
+ in the original dataframe.
+
+ >>> df = pd.DataFrame({'A': [1, 2, 3],
+ ... 'B': [400, 500, 600]})
+ >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
+ >>> df.update(new_df)
+ >>> df
+ A B
+ 0 1 4.0
+ 1 2 500.0
+ 2 3 6.0
+ """
+ import pandas.core.computation.expressions as expressions
+
+ # TODO: Support other joins
+ if join != "left": # pragma: no cover
+ raise NotImplementedError("Only left join is supported")
+ if errors not in ["ignore", "raise"]:
+ raise ValueError(
+ "The parameter errors must be either " "'ignore' or 'raise'"
+ )
+
+ if not isinstance(other, DataFrame):
+ other = DataFrame(other)
+
+ other = other.reindex_like(self)
+
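+ # per column, `mask` is True wherever the ORIGINAL value must be
+ # kept; positions where the mask is False are taken from `that`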
+ for col in self.columns:
+ this = self[col]._values
+ that = other[col]._values
+ if filter_func is not None:
+ with np.errstate(all="ignore"):
+ mask = ~filter_func(this) | isna(that)
+ else:
+ if errors == "raise":
+ mask_this = notna(that)
+ mask_that = notna(this)
+ if any(mask_this & mask_that):
+ raise ValueError("Data overlaps.")
+
+ if overwrite:
+ mask = isna(that)
+ else:
+ mask = notna(this)
+
+ # don't overwrite columns unnecessarily
+ if mask.all():
+ continue
+
+ self[col] = expressions.where(mask, this, that)
+
+ # ----------------------------------------------------------------------
+ # Data reshaping
+
+ _shared_docs[
+ "pivot"
+ ] = """
+ Return reshaped DataFrame organized by given index / column values.
+
+ Reshape data (produce a "pivot" table) based on column values. Uses
+ unique values from specified `index` / `columns` to form axes of the
+ resulting DataFrame. This function does not support data
+ aggregation; multiple values will result in a MultiIndex in the
+ columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
+
+ Parameters
+ ----------%s
+ index : string or object, optional
+ Column to use to make new frame's index. If None, uses
+ existing index.
+ columns : string or object
+ Column to use to make new frame's columns.
+ values : string, object or a list of the previous, optional
+ Column(s) to use for populating new frame's values. If not
+ specified, all remaining columns will be used and the result will
+ have hierarchically indexed columns.
+
+ .. versionchanged:: 0.23.0
+ Also accept list of column names.
+
+ Returns
+ -------
+ DataFrame
+ Returns reshaped DataFrame.
+
+ Raises
+ ------
+ ValueError
+ When there are any `index`, `columns` combinations with multiple
+ values. Use `DataFrame.pivot_table` when you need to aggregate.
+
+ See Also
+ --------
+ DataFrame.pivot_table : Generalization of pivot that can handle
+ duplicate values for one index/column pair.
+ DataFrame.unstack : Pivot based on the index values instead of a
+ column.
+
+ Notes
+ -----
+ For finer-tuned control, see hierarchical indexing documentation along
+ with the related stack/unstack methods.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
+ ... 'two'],
+ ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+ ... 'baz': [1, 2, 3, 4, 5, 6],
+ ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+ >>> df
+ foo bar baz zoo
+ 0 one A 1 x
+ 1 one B 2 y
+ 2 one C 3 z
+ 3 two A 4 q
+ 4 two B 5 w
+ 5 two C 6 t
+
+ >>> df.pivot(index='foo', columns='bar', values='baz')
+ bar A B C
+ foo
+ one 1 2 3
+ two 4 5 6
+
+ >>> df.pivot(index='foo', columns='bar')['baz']
+ bar A B C
+ foo
+ one 1 2 3
+ two 4 5 6
+
+ >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
+ baz zoo
+ bar A B C A B C
+ foo
+ one 1 2 3 x y z
+ two 4 5 6 q w t
+
+ A ValueError is raised if there are any duplicates.
+
+ >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
+ ... "bar": ['A', 'A', 'B', 'C'],
+ ... "baz": [1, 2, 3, 4]})
+ >>> df
+ foo bar baz
+ 0 one A 1
+ 1 one A 2
+ 2 two B 3
+ 3 two C 4
+
+ Notice that the first two rows are the same for our `index`
+ and `columns` arguments.
+
+ >>> df.pivot(index='foo', columns='bar', values='baz')
+ Traceback (most recent call last):
+ ...
+ ValueError: Index contains duplicate entries, cannot reshape
+ """
+
+ @Substitution("")
+ @Appender(_shared_docs["pivot"])
+ def pivot(self, index=None, columns=None, values=None):
+ from pandas.core.reshape.pivot import pivot
+
+ return pivot(self, index=index, columns=columns, values=values)
+
+ _shared_docs[
+ "pivot_table"
+ ] = """
+ Create a spreadsheet-style pivot table as a DataFrame. The levels in
+ the pivot table will be stored in MultiIndex objects (hierarchical
+ indexes) on the index and columns of the result DataFrame.
+
+ Parameters
+ ----------%s
+ values : column to aggregate, optional
+ index : column, Grouper, array, or list of the previous
+ If an array is passed, it must be the same length as the data. The
+ list can contain any of the other types (except list).
+ Keys to group by on the pivot table index. If an array is passed,
+ it is used in the same manner as column values.
+ columns : column, Grouper, array, or list of the previous
+ If an array is passed, it must be the same length as the data. The
+ list can contain any of the other types (except list).
+ Keys to group by on the pivot table column. If an array is passed,
+ it is used in the same manner as column values.
+ aggfunc : function, list of functions, dict, default numpy.mean
+ If a list of functions is passed, the resulting pivot table will
+ have hierarchical columns whose top level are the function names
+ (inferred from the function objects themselves).
+ If a dict is passed, the key is the column to aggregate and the
+ value is the function or list of functions.
+ fill_value : scalar, default None
+ Value to replace missing values with.
+ margins : bool, default False
+ Add all rows / columns (e.g. for subtotal / grand totals).
+ dropna : bool, default True
+ Do not include columns whose entries are all NaN.
+ margins_name : str, default 'All'
+ Name of the row / column that will contain the totals
+ when margins is True.
+ observed : bool, default False
+ This only applies if any of the groupers are Categoricals.
+ If True: only show observed values for categorical groupers.
+ If False: show all values for categorical groupers.
+
+ .. versionchanged:: 0.25.0
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ DataFrame.pivot : Pivot without aggregation that can handle
+ non-numeric data.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
+ ... "bar", "bar", "bar", "bar"],
+ ... "B": ["one", "one", "one", "two", "two",
+ ... "one", "one", "two", "two"],
+ ... "C": ["small", "large", "large", "small",
+ ... "small", "large", "small", "small",
+ ... "large"],
+ ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+ ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
+ >>> df
+ A B C D E
+ 0 foo one small 1 2
+ 1 foo one large 2 4
+ 2 foo one large 2 5
+ 3 foo two small 3 5
+ 4 foo two small 3 6
+ 5 bar one large 4 6
+ 6 bar one small 5 8
+ 7 bar two small 6 9
+ 8 bar two large 7 9
+
+ This first example aggregates values by taking the sum.
+
+ >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+ ... columns=['C'], aggfunc=np.sum)
+ >>> table
+ C large small
+ A B
+ bar one 4.0 5.0
+ two 7.0 6.0
+ foo one 4.0 1.0
+ two NaN 6.0
+
+ We can also fill missing values using the `fill_value` parameter.
+
+ >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+ ... columns=['C'], aggfunc=np.sum, fill_value=0)
+ >>> table
+ C large small
+ A B
+ bar one 4 5
+ two 7 6
+ foo one 4 1
+ two 0 6
+
+ The next example aggregates by taking the mean across multiple columns.
+
+ >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+ ... aggfunc={'D': np.mean,
+ ... 'E': np.mean})
+ >>> table
+ D E
+ A C
+ bar large 5.500000 7.500000
+ small 5.500000 8.500000
+ foo large 2.000000 4.500000
+ small 2.333333 4.333333
+
+ We can also calculate multiple types of aggregations for any given
+ value column.
+
+ >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+ ... aggfunc={'D': np.mean,
+ ... 'E': [min, max, np.mean]})
+ >>> table
+ D E
+ mean max mean min
+ A C
+ bar large 5.500000 9.0 7.500000 6.0
+ small 5.500000 9.0 8.500000 8.0
+ foo large 2.000000 5.0 4.500000 4.0
+ small 2.333333 6.0 4.333333 2.0
+ """
+
+ @Substitution("")
+ @Appender(_shared_docs["pivot_table"])
+ def pivot_table(
+ self,
+ values=None,
+ index=None,
+ columns=None,
+ aggfunc="mean",
+ fill_value=None,
+ margins=False,
+ dropna=True,
+ margins_name="All",
+ observed=False,
+ ):
+ from pandas.core.reshape.pivot import pivot_table
+
+ return pivot_table(
+ self,
+ values=values,
+ index=index,
+ columns=columns,
+ aggfunc=aggfunc,
+ fill_value=fill_value,
+ margins=margins,
+ dropna=dropna,
+ margins_name=margins_name,
+ observed=observed,
+ )
+
+ def stack(self, level=-1, dropna=True):
+ """
+ Stack the prescribed level(s) from columns to index.
+
+ Return a reshaped DataFrame or Series having a multi-level
+ index with one or more new inner-most levels compared to the current
+ DataFrame. The new inner-most levels are created by pivoting the
+ columns of the current dataframe:
+
+ - if the columns have a single level, the output is a Series;
+ - if the columns have multiple levels, the new index
+ level(s) is (are) taken from the prescribed level(s) and
+ the output is a DataFrame.
+
+ The new index levels are sorted.
+
+ Parameters
+ ----------
+ level : int, str, list, default -1
+ Level(s) to stack from the column axis onto the index
+ axis, defined as one index or label, or a list of indices
+ or labels.
+ dropna : bool, default True
+ Whether to drop rows in the resulting Frame/Series with
+ missing values. Stacking a column level onto the index
+ axis can create combinations of index and column values
+ that are missing from the original dataframe. See Examples
+ section.
+
+ Returns
+ -------
+ DataFrame or Series
+ Stacked dataframe or series.
+
+ See Also
+ --------
+ DataFrame.unstack : Unstack prescribed level(s) from index axis
+ onto column axis.
+ DataFrame.pivot : Reshape dataframe from long format to wide
+ format.
+ DataFrame.pivot_table : Create a spreadsheet-style pivot table
+ as a DataFrame.
+
+ Notes
+ -----
+ The function is named by analogy with a collection of books
+ being reorganized from being side by side on a horizontal
+ position (the columns of the dataframe) to being stacked
+ vertically on top of each other (in the index of the
+ dataframe).
+
+ Examples
+ --------
+ **Single level columns**
+
+ >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
+ ... index=['cat', 'dog'],
+ ... columns=['weight', 'height'])
+
+ Stacking a dataframe with a single level column axis returns a Series:
+
+ >>> df_single_level_cols
+ weight height
+ cat 0 1
+ dog 2 3
+ >>> df_single_level_cols.stack()
+ cat weight 0
+ height 1
+ dog weight 2
+ height 3
+ dtype: int64
+
+ **Multi level columns: simple case**
+
+ >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+ ... ('weight', 'pounds')])
+ >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
+ ... index=['cat', 'dog'],
+ ... columns=multicol1)
+
+ Stacking a dataframe with a multi-level column axis:
+
+ >>> df_multi_level_cols1
+ weight
+ kg pounds
+ cat 1 2
+ dog 2 4
+ >>> df_multi_level_cols1.stack()
+ weight
+ cat kg 1
+ pounds 2
+ dog kg 2
+ pounds 4
+
+ **Missing values**
+
+ >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+ ... ('height', 'm')])
+ >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
+ ... index=['cat', 'dog'],
+ ... columns=multicol2)
+
+ It is common to have missing values when stacking a dataframe
+ with multi-level columns, as the stacked dataframe typically
+ has more values than the original dataframe. Missing values
+ are filled with NaNs:
+
+ >>> df_multi_level_cols2
+ weight height
+ kg m
+ cat 1.0 2.0
+ dog 3.0 4.0
+ >>> df_multi_level_cols2.stack()
+ height weight
+ cat kg NaN 1.0
+ m 2.0 NaN
+ dog kg NaN 3.0
+ m 4.0 NaN
+
+ **Prescribing the level(s) to be stacked**
+
+ The first parameter controls which level or levels are stacked:
+
+ >>> df_multi_level_cols2.stack(0)
+ kg m
+ cat height NaN 2.0
+ weight 1.0 NaN
+ dog height NaN 4.0
+ weight 3.0 NaN
+ >>> df_multi_level_cols2.stack([0, 1])
+ cat height m 2.0
+ weight kg 1.0
+ dog height m 4.0
+ weight kg 3.0
+ dtype: float64
+
+ **Dropping missing values**
+
+ >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
+ ... index=['cat', 'dog'],
+ ... columns=multicol2)
+
+ Note that rows where all values are missing are dropped by
+ default but this behaviour can be controlled via the dropna
+ keyword parameter:
+
+ >>> df_multi_level_cols3
+ weight height
+ kg m
+ cat NaN 1.0
+ dog 2.0 3.0
+ >>> df_multi_level_cols3.stack(dropna=False)
+ height weight
+ cat kg NaN NaN
+ m 1.0 NaN
+ dog kg NaN 2.0
+ m 3.0 NaN
+ >>> df_multi_level_cols3.stack(dropna=True)
+ height weight
+ cat m 1.0 NaN
+ dog kg NaN 2.0
+ m 3.0 NaN
+ """
+ from pandas.core.reshape.reshape import stack, stack_multiple
+
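+ # A list/tuple of levels is stacked one level at a time via
+ # ``stack_multiple``; a single level goes straight to ``stack``.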
+ if isinstance(level, (tuple, list)):
+ return stack_multiple(self, level, dropna=dropna)
+ else:
+ return stack(self, level, dropna=dropna)
+
+ def explode(self, column: Union[str, Tuple]) -> "DataFrame":
+ """
+ Transform each element of a list-like to a row, replicating the
+ index values.
+
+ .. versionadded:: 0.25.0
+
+ Parameters
+ ----------
+ column : str or tuple
+ Column to explode.
+
+ Returns
+ -------
+ DataFrame
+ Exploded lists to rows of the subset columns;
+ index will be duplicated for these rows.
+
+ Raises
+ ------
+ ValueError :
+ If columns of the frame are not unique.
+
+ See Also
+ --------
+ DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
+ index labels.
+ DataFrame.melt : Unpivot a DataFrame from wide format to long format.
+ Series.explode : Explode a Series from list-like entries to long format.
+
+ Notes
+ -----
+ This routine will explode list-likes including lists, tuples,
+ Series, and np.ndarray. The result dtype of the subset rows will
+ be object. Scalars will be returned unchanged. Empty list-likes will
+ result in a np.nan for that row.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
+ >>> df
+ A B
+ 0 [1, 2, 3] 1
+ 1 foo 1
+ 2 [] 1
+ 3 [3, 4] 1
+
+ >>> df.explode('A')
+ A B
+ 0 1 1
+ 0 2 1
+ 0 3 1
+ 1 foo 1
+ 2 NaN 1
+ 3 3 1
+ 3 4 1
+ """
+
+ if not (is_scalar(column) or isinstance(column, tuple)):
+ raise ValueError("column must be a scalar")
+ if not self.columns.is_unique:
+ raise ValueError("columns must be unique")
+
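+ # Explode the single column, then stitch it back onto the remaining
+ # columns; the final reindex restores the original column order.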
+ result = self[column].explode()
+ return (
+ self.drop([column], axis=1)
+ .join(result)
+ .reindex(columns=self.columns, copy=False)
+ )
+
+ def unstack(self, level=-1, fill_value=None):
+ """
+ Pivot a level of the (necessarily hierarchical) index labels, returning
+ a DataFrame having a new level of column labels whose inner-most level
+ consists of the pivoted index labels.
+
+ If the index is not a MultiIndex, the output will be a Series
+ (the analogue of stack when the columns are not a MultiIndex).
+
+ The level involved will automatically get sorted.
+
+ Parameters
+ ----------
+ level : int, str, or list of these, default -1 (last level)
+ Level(s) of index to unstack, can pass level name.
+ fill_value : int, str or dict
+ Replace NaN with this value if the unstack produces missing values.
+
+ .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ Series or DataFrame
+
+ See Also
+ --------
+ DataFrame.pivot : Pivot a table based on column values.
+ DataFrame.stack : Pivot a level of the column labels (inverse operation
+ from `unstack`).
+
+ Examples
+ --------
+ >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
+ ... ('two', 'a'), ('two', 'b')])
+ >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
+ >>> s
+ one a 1.0
+ b 2.0
+ two a 3.0
+ b 4.0
+ dtype: float64
+
+ >>> s.unstack(level=-1)
+ a b
+ one 1.0 2.0
+ two 3.0 4.0
+
+ >>> s.unstack(level=0)
+ one two
+ a 1.0 3.0
+ b 2.0 4.0
+
+ >>> df = s.unstack(level=0)
+ >>> df.unstack()
+ one a 1.0
+ b 2.0
+ two a 3.0
+ b 4.0
+ dtype: float64
+ """
+ from pandas.core.reshape.reshape import unstack
+
+ return unstack(self, level, fill_value)
+
+ _shared_docs[
+ "melt"
+ ] = """
+ Unpivot a DataFrame from wide format to long format, optionally
+ leaving identifier variables set.
+
+ This function is useful to massage a DataFrame into a format where one
+ or more columns are identifier variables (`id_vars`), while all other
+ columns, considered measured variables (`value_vars`), are "unpivoted" to
+ the row axis, leaving just two non-identifier columns, 'variable' and
+ 'value'.
+ %(versionadded)s
+ Parameters
+ ----------
+ frame : DataFrame
+ id_vars : tuple, list, or ndarray, optional
+ Column(s) to use as identifier variables.
+ value_vars : tuple, list, or ndarray, optional
+ Column(s) to unpivot. If not specified, uses all columns that
+ are not set as `id_vars`.
+ var_name : scalar
+ Name to use for the 'variable' column. If None it uses
+ ``frame.columns.name`` or 'variable'.
+ value_name : scalar, default 'value'
+ Name to use for the 'value' column.
+ col_level : int or string, optional
+ If columns are a MultiIndex then use this level to melt.
+
+ Returns
+ -------
+ DataFrame
+ Unpivoted DataFrame.
+
+ See Also
+ --------
+ %(other)s
+ pivot_table
+ DataFrame.pivot
+ Series.explode
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
+ ... 'B': {0: 1, 1: 3, 2: 5},
+ ... 'C': {0: 2, 1: 4, 2: 6}})
+ >>> df
+ A B C
+ 0 a 1 2
+ 1 b 3 4
+ 2 c 5 6
+
+ >>> %(caller)sid_vars=['A'], value_vars=['B'])
+ A variable value
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+
+ >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
+ A variable value
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+ 3 a C 2
+ 4 b C 4
+ 5 c C 6
+
+ The names of 'variable' and 'value' columns can be customized:
+
+ >>> %(caller)sid_vars=['A'], value_vars=['B'],
+ ... var_name='myVarname', value_name='myValname')
+ A myVarname myValname
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+
+ If you have multi-index columns:
+
+ >>> df.columns = [list('ABC'), list('DEF')]
+ >>> df
+ A B C
+ D E F
+ 0 a 1 2
+ 1 b 3 4
+ 2 c 5 6
+
+ >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
+ A variable value
+ 0 a B 1
+ 1 b B 3
+ 2 c B 5
+
+ >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
+ (A, D) variable_0 variable_1 value
+ 0 a B E 1
+ 1 b B E 3
+ 2 c B E 5
+ """
+
+ @Appender(
+ _shared_docs["melt"]
+ % dict(
+ caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt"
+ )
+ )
+ def melt(
+ self,
+ id_vars=None,
+ value_vars=None,
+ var_name=None,
+ value_name="value",
+ col_level=None,
+ ):
+ from pandas.core.reshape.melt import melt
+
+ return melt(
+ self,
+ id_vars=id_vars,
+ value_vars=value_vars,
+ var_name=var_name,
+ value_name=value_name,
+ col_level=col_level,
+ )
+
+ # ----------------------------------------------------------------------
+ # Time series-related
+
+ def diff(self, periods=1, axis=0):
+ """
+ First discrete difference of element.
+
+ Calculates the difference of a DataFrame element compared with another
+ element in the DataFrame (default is the element in the same column
+ of the previous row).
+
+ Parameters
+ ----------
+ periods : int, default 1
+ Periods to shift for calculating difference, accepts negative
+ values.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Take difference over rows (0) or columns (1).
+
+ .. versionadded:: 0.16.1
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ Series.diff: First discrete difference for a Series.
+ DataFrame.pct_change: Percent change over given number of periods.
+ DataFrame.shift: Shift index by desired number of periods with an
+ optional time freq.
+
+ Examples
+ --------
+ Difference with previous row
+
+ >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
+ ... 'b': [1, 1, 2, 3, 5, 8],
+ ... 'c': [1, 4, 9, 16, 25, 36]})
+ >>> df
+ a b c
+ 0 1 1 1
+ 1 2 1 4
+ 2 3 2 9
+ 3 4 3 16
+ 4 5 5 25
+ 5 6 8 36
+
+ >>> df.diff()
+ a b c
+ 0 NaN NaN NaN
+ 1 1.0 0.0 3.0
+ 2 1.0 1.0 5.0
+ 3 1.0 1.0 7.0
+ 4 1.0 2.0 9.0
+ 5 1.0 3.0 11.0
+
+ Difference with previous column
+
+ >>> df.diff(axis=1)
+ a b c
+ 0 NaN 0.0 0.0
+ 1 NaN -1.0 3.0
+ 2 NaN -1.0 7.0
+ 3 NaN -1.0 13.0
+ 4 NaN 0.0 20.0
+ 5 NaN 2.0 28.0
+
+ Difference with 3rd previous row
+
+ >>> df.diff(periods=3)
+ a b c
+ 0 NaN NaN NaN
+ 1 NaN NaN NaN
+ 2 NaN NaN NaN
+ 3 3.0 2.0 15.0
+ 4 3.0 4.0 21.0
+ 5 3.0 6.0 27.0
+
+ Difference with following row
+
+ >>> df.diff(periods=-1)
+ a b c
+ 0 -1.0 0.0 -3.0
+ 1 -1.0 -1.0 -5.0
+ 2 -1.0 -1.0 -7.0
+ 3 -1.0 -2.0 -9.0
+ 4 -1.0 -3.0 -11.0
+ 5 NaN NaN NaN
+ """
+ bm_axis = self._get_block_manager_axis(axis)
+ new_data = self._data.diff(n=periods, axis=bm_axis)
+ return self._constructor(new_data)
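+
+ # Note: along axis=0, ``df.diff(periods)`` matches ``df - df.shift(periods)``;
+ # the block-level call above computes the same result block by block.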
+
+ # ----------------------------------------------------------------------
+ # Function application
+
+ def _gotitem(
+ self,
+ key: Union[str, List[str]],
+ ndim: int,
+ subset: Optional[Union[Series, ABCDataFrame]] = None,
+ ) -> Union[Series, ABCDataFrame]:
+ """
+ Sub-classes to define. Return a sliced object.
+
+ Parameters
+ ----------
+ key : string / list of selections
+ ndim : {1, 2}
+ Requested ndim of result.
+ subset : object, default None
+ Subset to act on.
+ """
+ if subset is None:
+ subset = self
+ elif subset.ndim == 1: # is Series
+ return subset
+
+ # TODO: _shallow_copy(subset)?
+ return subset[key]
+
+ _agg_summary_and_see_also_doc = dedent(
+ """
+ The aggregation operations are always performed over an axis, either the
+ index (default) or the column axis. This behavior is different from
+ `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
+ `var`), where the default is to compute the aggregation of the flattened
+ array, e.g., ``numpy.mean(arr_2d)`` as opposed to
+ ``numpy.mean(arr_2d, axis=0)``.
+
+ `agg` is an alias for `aggregate`. Use the alias.
+
+ See Also
+ --------
+ DataFrame.apply : Perform any type of operation.
+ DataFrame.transform : Perform transformation type operations.
+ core.groupby.GroupBy : Perform operations over groups.
+ core.resample.Resampler : Perform operations over resampled bins.
+ core.window.Rolling : Perform operations over rolling window.
+ core.window.Expanding : Perform operations over expanding window.
+ core.window.EWM : Perform operation over exponential weighted
+ window.
+ """
+ )
+
+ _agg_examples_doc = dedent(
+ """
+ Examples
+ --------
+ >>> df = pd.DataFrame([[1, 2, 3],
+ ... [4, 5, 6],
+ ... [7, 8, 9],
+ ... [np.nan, np.nan, np.nan]],
+ ... columns=['A', 'B', 'C'])
+
+ Aggregate these functions over the rows.
+
+ >>> df.agg(['sum', 'min'])
+ A B C
+ sum 12.0 15.0 18.0
+ min 1.0 2.0 3.0
+
+ Different aggregations per column.
+
+ >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
+ A B
+ max NaN 8.0
+ min 1.0 2.0
+ sum 12.0 NaN
+
+ Aggregate over the columns.
+
+ >>> df.agg("mean", axis="columns")
+ 0 2.0
+ 1 5.0
+ 2 8.0
+ 3 NaN
+ dtype: float64
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_summary_and_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="\n.. versionadded:: 0.20.0\n",
+ **_shared_doc_kwargs
+ )
+ @Appender(_shared_docs["aggregate"])
+ def aggregate(self, func, axis=0, *args, **kwargs):
+ axis = self._get_axis_number(axis)
+
+ result = None
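+ # self._aggregate may raise TypeError on specs it cannot aggregate
+ # directly; in that case (or when it returns None) fall back to apply.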
+ try:
+ result, how = self._aggregate(func, axis=axis, *args, **kwargs)
+ except TypeError:
+ pass
+ if result is None:
+ return self.apply(func, axis=axis, args=args, **kwargs)
+ return result
+
+ def _aggregate(self, arg, axis=0, *args, **kwargs):
+ if axis == 1:
+ # NDFrame.aggregate returns a tuple, and we need to transpose
+ # only result
+ result, how = self.T._aggregate(arg, *args, **kwargs)
+ result = result.T if result is not None else result
+ return result, how
+ return super()._aggregate(arg, *args, **kwargs)
+
+ agg = aggregate
+
+ @Appender(_shared_docs["transform"] % _shared_doc_kwargs)
+ def transform(self, func, axis=0, *args, **kwargs):
+ axis = self._get_axis_number(axis)
+ if axis == 1:
+ return self.T.transform(func, *args, **kwargs).T
+ return super().transform(func, *args, **kwargs)
+
+ def apply(
+ self,
+ func,
+ axis=0,
+ broadcast=None,
+ raw=False,
+ reduce=None,
+ result_type=None,
+ args=(),
+ **kwds
+ ):
+ """
+ Apply a function along an axis of the DataFrame.
+
+ Objects passed to the function are Series objects whose index is
+ either the DataFrame's index (``axis=0``) or the DataFrame's columns
+ (``axis=1``). By default (``result_type=None``), the final return type
+ is inferred from the return type of the applied function. Otherwise,
+ it depends on the `result_type` argument.
+
+ Parameters
+ ----------
+ func : function
+ Function to apply to each column or row.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Axis along which the function is applied:
+
+ * 0 or 'index': apply function to each column.
+ * 1 or 'columns': apply function to each row.
+ broadcast : bool, optional
+ Only relevant for aggregation functions:
+
+ * ``False`` or ``None`` : returns a Series whose length is the
+ length of the index or the number of columns (based on the
+ `axis` parameter)
+ * ``True`` : results will be broadcast to the original shape
+ of the frame, the original index and columns will be retained.
+
+ .. deprecated:: 0.23.0
+ This argument will be removed in a future version, replaced
+ by result_type='broadcast'.
+
+ raw : bool, default False
+ * ``False`` : passes each row or column as a Series to the
+ function.
+ * ``True`` : the passed function will receive ndarray objects
+ instead.
+ If you are just applying a NumPy reduction function this will
+ achieve much better performance.
+ reduce : bool or None, default None
+ Try to apply reduction procedures. If the DataFrame is empty,
+ `apply` will use `reduce` to determine whether the result
+ should be a Series or a DataFrame. If ``reduce=None`` (the
+ default), `apply`'s return value will be guessed by calling
+ `func` on an empty Series
+ (note: while guessing, exceptions raised by `func` will be
+ ignored).
+ If ``reduce=True`` a Series will always be returned, and if
+ ``reduce=False`` a DataFrame will always be returned.
+
+ .. deprecated:: 0.23.0
+ This argument will be removed in a future version, replaced
+ by ``result_type='reduce'``.
+
+ result_type : {'expand', 'reduce', 'broadcast', None}, default None
+ These only act when ``axis=1`` (columns):
+
+ * 'expand' : list-like results will be turned into columns.
+ * 'reduce' : returns a Series if possible rather than expanding
+ list-like results. This is the opposite of 'expand'.
+ * 'broadcast' : results will be broadcast to the original shape
+ of the DataFrame, the original index and columns will be
+ retained.
+
+ The default behaviour (None) depends on the return value of the
+ applied function: list-like results will be returned as a Series
+ of those. However if the apply function returns a Series these
+ are expanded to columns.
+
+ .. versionadded:: 0.23.0
+
+ args : tuple
+ Positional arguments to pass to `func` in addition to the
+ array/series.
+ **kwds
+ Additional keyword arguments to pass as keywords arguments to
+ `func`.
+
+ Returns
+ -------
+ Series or DataFrame
+ Result of applying ``func`` along the given axis of the
+ DataFrame.
+
+ See Also
+ --------
+ DataFrame.applymap: For elementwise operations.
+ DataFrame.aggregate: Only perform aggregating type operations.
+ DataFrame.transform: Only perform transforming type operations.
+
+ Notes
+ -----
+ In the current implementation apply calls `func` twice on the
+ first column/row to decide whether it can take a fast or slow
+ code path. This can lead to unexpected behavior if `func` has
+ side-effects, as they will take effect twice for the first
+ column/row.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
+ >>> df
+ A B
+ 0 4 9
+ 1 4 9
+ 2 4 9
+
+ Using a numpy universal function (in this case the same as
+ ``np.sqrt(df)``):
+
+ >>> df.apply(np.sqrt)
+ A B
+ 0 2.0 3.0
+ 1 2.0 3.0
+ 2 2.0 3.0
+
+ Using a reducing function on either axis
+
+ >>> df.apply(np.sum, axis=0)
+ A 12
+ B 27
+ dtype: int64
+
+ >>> df.apply(np.sum, axis=1)
+ 0 13
+ 1 13
+ 2 13
+ dtype: int64
+
+ Returning a list-like will result in a Series
+
+ >>> df.apply(lambda x: [1, 2], axis=1)
+ 0 [1, 2]
+ 1 [1, 2]
+ 2 [1, 2]
+ dtype: object
+
+ Passing result_type='expand' will expand list-like results
+ to columns of a Dataframe
+
+ >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
+ 0 1
+ 0 1 2
+ 1 1 2
+ 2 1 2
+
+ Returning a Series inside the function is similar to passing
+ ``result_type='expand'``. The resulting column names
+ will be the Series index.
+
+ >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
+ foo bar
+ 0 1 2
+ 1 1 2
+ 2 1 2
+
+ Passing ``result_type='broadcast'`` will ensure the same shape
+ result, whether list-like or scalar is returned by the function,
+ and broadcast it along the axis. The resulting column names will
+ be the originals.
+
+ >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
+ A B
+ 0 1 2
+ 1 1 2
+ 2 1 2
+ """
+ from pandas.core.apply import frame_apply
+
+ op = frame_apply(
+ self,
+ func=func,
+ axis=axis,
+ broadcast=broadcast,
+ raw=raw,
+ reduce=reduce,
+ result_type=result_type,
+ args=args,
+ kwds=kwds,
+ )
+ return op.get_result()
+
+ def applymap(self, func):
+ """
+ Apply a function to a Dataframe elementwise.
+
+ This method applies a function that accepts and returns a scalar
+ to every element of a DataFrame.
+
+ Parameters
+ ----------
+ func : callable
+ Python function, returns a single value from a single value.
+
+ Returns
+ -------
+ DataFrame
+ Transformed DataFrame.
+
+ See Also
+ --------
+ DataFrame.apply : Apply a function along input axis of DataFrame.
+
+ Notes
+ -----
+ In the current implementation applymap calls `func` twice on the
+ first column/row to decide whether it can take a fast or slow
+ code path. This can lead to unexpected behavior if `func` has
+ side-effects, as they will take effect twice for the first
+ column/row.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
+ >>> df
+ 0 1
+ 0 1.000 2.120
+ 1 3.356 4.567
+
+ >>> df.applymap(lambda x: len(str(x)))
+ 0 1
+ 0 3 4
+ 1 5 5
+
+ Note that a vectorized version of `func` often exists, which will
+ be much faster. You could square each number elementwise.
+
+ >>> df.applymap(lambda x: x**2)
+ 0 1
+ 0 1.000000 4.494400
+ 1 11.262736 20.857489
+
+ But it's better to avoid applymap in that case.
+
+ >>> df ** 2
+ 0 1
+ 0 1.000000 4.494400
+ 1 11.262736 20.857489
+ """
+
+ # if we have a dtype == 'M8[ns]', provide boxed values
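+ # (routing through object dtype means datetimelike values are passed
+ # to ``func`` as boxed Timestamps/Timedeltas rather than raw integers)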
+ def infer(x):
+ if x.empty:
+ return lib.map_infer(x, func)
+ return lib.map_infer(x.astype(object).values, func)
+
+ return self.apply(infer)
+
+ # ----------------------------------------------------------------------
+ # Merging / joining methods
+
+ def append(self, other, ignore_index=False, verify_integrity=False, sort=None):
+ """
+ Append rows of `other` to the end of caller, returning a new object.
+
+ Columns in `other` that are not in the caller are added as new columns.
+
+ Parameters
+ ----------
+ other : DataFrame or Series/dict-like object, or list of these
+ The data to append.
+ ignore_index : boolean, default False
+ If True, do not use the index labels.
+ verify_integrity : boolean, default False
+ If True, raise ValueError on creating index with duplicates.
+ sort : boolean, default None
+ Sort columns if the columns of `self` and `other` are not aligned.
+ The default sorting is deprecated and will change to not-sorting
+ in a future version of pandas. Explicitly pass ``sort=True`` to
+ silence the warning and sort. Explicitly pass ``sort=False`` to
+ silence the warning and not sort.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ DataFrame
+
+ See Also
+ --------
+ concat : General function to concatenate DataFrame or Series objects.
+
+ Notes
+ -----
+ If a list of dict/series is passed and the keys are all contained in
+ the DataFrame's index, the order of the columns in the resulting
+ DataFrame will be unchanged.
+
+ Iteratively appending rows to a DataFrame can be more computationally
+ intensive than a single concatenate. A better solution is to append
+ those rows to a list and then concatenate the list with the original
+ DataFrame all at once.
+
+ Examples
+ --------
+
+ >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
+ >>> df
+ A B
+ 0 1 2
+ 1 3 4
+ >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
+ >>> df.append(df2)
+ A B
+ 0 1 2
+ 1 3 4
+ 0 5 6
+ 1 7 8
+
+ With `ignore_index` set to True:
+
+ >>> df.append(df2, ignore_index=True)
+ A B
+ 0 1 2
+ 1 3 4
+ 2 5 6
+ 3 7 8
+
+ The following, while not recommended ways of generating DataFrames,
+ show two approaches to building a DataFrame from multiple data sources.
+
+ Less efficient:
+
+ >>> df = pd.DataFrame(columns=['A'])
+ >>> for i in range(5):
+ ... df = df.append({'A': i}, ignore_index=True)
+ >>> df
+ A
+ 0 0
+ 1 1
+ 2 2
+ 3 3
+ 4 4
+
+ More efficient:
+
+ >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
+ ... ignore_index=True)
+ A
+ 0 0
+ 1 1
+ 2 2
+ 3 3
+ 4 4
+ """
+ if isinstance(other, (Series, dict)):
+ if isinstance(other, dict):
+ other = Series(other)
+ if other.name is None and not ignore_index:
+ raise TypeError(
+ "Can only append a Series if ignore_index=True"
+ " or if the Series has a name"
+ )
+
+ if other.name is None:
+ index = None
+ else:
+ # other must have the same index name as self, otherwise
+ # index name will be reset
+ index = Index([other.name], name=self.index.name)
+
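+ # Columns of ``other`` that self lacks get appended; astype(object)
+ # is the fallback when the column index cannot hold the new labels.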
+ idx_diff = other.index.difference(self.columns)
+ try:
+ combined_columns = self.columns.append(idx_diff)
+ except TypeError:
+ combined_columns = self.columns.astype(object).append(idx_diff)
+ other = other.reindex(combined_columns, copy=False)
+ other = DataFrame(
+ other.values.reshape((1, len(other))),
+ index=index,
+ columns=combined_columns,
+ )
+ other = other._convert(datetime=True, timedelta=True)
+ if not self.columns.equals(combined_columns):
+ self = self.reindex(columns=combined_columns)
+ elif isinstance(other, list) and not isinstance(other[0], DataFrame):
+ other = DataFrame(other)
+ if (self.columns.get_indexer(other.columns) >= 0).all():
+ other = other.reindex(columns=self.columns)
+
+ from pandas.core.reshape.concat import concat
+
+ if isinstance(other, (list, tuple)):
+ to_concat = [self] + other
+ else:
+ to_concat = [self, other]
+ return concat(
+ to_concat,
+ ignore_index=ignore_index,
+ verify_integrity=verify_integrity,
+ sort=sort,
+ )
+
+ def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False):
+ """
+ Join columns of another DataFrame.
+
+ Join columns with `other` DataFrame either on index or on a key
+ column. Efficiently join multiple DataFrame objects by index at once by
+ passing a list.
+
+ Parameters
+ ----------
+ other : DataFrame, Series, or list of DataFrame
+ Index should be similar to one of the columns in this one. If a
+ Series is passed, its name attribute must be set, and that will be
+ used as the column name in the resulting joined DataFrame.
+ on : str, list of str, or array-like, optional
+ Column or index level name(s) in the caller to join on the index
+ in `other`, otherwise joins index-on-index. If multiple
+ values given, the `other` DataFrame must have a MultiIndex. Can
+ pass an array as the join key if it is not already contained in
+ the calling DataFrame. Like an Excel VLOOKUP operation.
+ how : {'left', 'right', 'outer', 'inner'}, default 'left'
+ How to handle the operation of the two objects.
+
+ * left: use calling frame's index (or column if on is specified)
+ * right: use `other`'s index.
+ * outer: form union of calling frame's index (or column if on is
+ specified) with `other`'s index, and sort it
+ lexicographically.
+ * inner: form intersection of calling frame's index (or column if
+ on is specified) with `other`'s index, preserving the order
+ of the calling's one.
+ lsuffix : str, default ''
+ Suffix to use from left frame's overlapping columns.
+ rsuffix : str, default ''
+ Suffix to use from right frame's overlapping columns.
+ sort : bool, default False
+ Order result DataFrame lexicographically by the join key. If False,
+ the order of the join key depends on the join type (how keyword).
+
+ Returns
+ -------
+ DataFrame
+ A dataframe containing columns from both the caller and `other`.
+
+ See Also
+ --------
+ DataFrame.merge : For column(s)-on-columns(s) operations.
+
+ Notes
+ -----
+ Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
+ passing a list of `DataFrame` objects.
+
+ Support for specifying index levels as the `on` parameter was added
+ in version 0.23.0.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
+ ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
+
+ >>> df
+ key A
+ 0 K0 A0
+ 1 K1 A1
+ 2 K2 A2
+ 3 K3 A3
+ 4 K4 A4
+ 5 K5 A5
+
+ >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
+ ... 'B': ['B0', 'B1', 'B2']})
+
+ >>> other
+ key B
+ 0 K0 B0
+ 1 K1 B1
+ 2 K2 B2
+
+ Join DataFrames using their indexes.
+
+ >>> df.join(other, lsuffix='_caller', rsuffix='_other')
+ key_caller A key_other B
+ 0 K0 A0 K0 B0
+ 1 K1 A1 K1 B1
+ 2 K2 A2 K2 B2
+ 3 K3 A3 NaN NaN
+ 4 K4 A4 NaN NaN
+ 5 K5 A5 NaN NaN
+
+ If we want to join using the key columns, we need to set key to be
+ the index in both `df` and `other`. The joined DataFrame will have
+ key as its index.
+
+ >>> df.set_index('key').join(other.set_index('key'))
+ A B
+ key
+ K0 A0 B0
+ K1 A1 B1
+ K2 A2 B2
+ K3 A3 NaN
+ K4 A4 NaN
+ K5 A5 NaN
+
+ Another option to join using the key columns is to use the `on`
+ parameter. DataFrame.join always uses `other`'s index but we can use
+ any column in `df`. This method preserves the original DataFrame's
+ index in the result.
+
+ >>> df.join(other.set_index('key'), on='key')
+ key A B
+ 0 K0 A0 B0
+ 1 K1 A1 B1
+ 2 K2 A2 B2
+ 3 K3 A3 NaN
+ 4 K4 A4 NaN
+ 5 K5 A5 NaN
+ """
+ # For SparseDataFrame's benefit
+ return self._join_compat(
+ other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
+ )
+
+ def _join_compat(
+ self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
+ ):
+ from pandas.core.reshape.merge import merge
+ from pandas.core.reshape.concat import concat
+
+ if isinstance(other, Series):
+ if other.name is None:
+ raise ValueError("Other Series must have a name")
+ other = DataFrame({other.name: other})
+
+ if isinstance(other, DataFrame):
+ return merge(
+ self,
+ other,
+ left_on=on,
+ how=how,
+ left_index=on is None,
+ right_index=True,
+ suffixes=(lsuffix, rsuffix),
+ sort=sort,
+ )
+ else:
+ if on is not None:
+ raise ValueError(
+ "Joining multiple DataFrames only supported" " for joining on index"
+ )
+
+ frames = [self] + list(other)
+
+ can_concat = all(df.index.is_unique for df in frames)
+
+ # join indexes only using concat
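+ # (concat is cheaper than a chain of merges but requires unique
+ # indexes; a left join is an outer concat reindexed to self.index)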
+ if can_concat:
+ if how == "left":
+ res = concat(frames, axis=1, join="outer", verify_integrity=True)
+ return res.reindex(self.index, copy=False)
+ else:
+ return concat(frames, axis=1, join=how, verify_integrity=True)
+
+ joined = frames[0]
+
+ for frame in frames[1:]:
+ joined = merge(
+ joined, frame, how=how, left_index=True, right_index=True
+ )
+
+ return joined
+
+ @Substitution("")
+ @Appender(_merge_doc, indents=2)
+ def merge(
+ self,
+ right,
+ how="inner",
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_index=False,
+ right_index=False,
+ sort=False,
+ suffixes=("_x", "_y"),
+ copy=True,
+ indicator=False,
+ validate=None,
+ ):
+ from pandas.core.reshape.merge import merge
+
+ return merge(
+ self,
+ right,
+ how=how,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ left_index=left_index,
+ right_index=right_index,
+ sort=sort,
+ suffixes=suffixes,
+ copy=copy,
+ indicator=indicator,
+ validate=validate,
+ )
+
+ def round(self, decimals=0, *args, **kwargs):
+ """
+ Round a DataFrame to a variable number of decimal places.
+
+ Parameters
+ ----------
+ decimals : int, dict, Series
+ Number of decimal places to round each column to. If an int is
+ given, round each column to the same number of places.
+ Otherwise dict and Series round to variable numbers of places.
+ Column names should be in the keys if `decimals` is a
+ dict-like, or in the index if `decimals` is a Series. Any
+ columns not included in `decimals` will be left as is. Elements
+ of `decimals` which are not columns of the input will be
+ ignored.
+ *args
+ Additional keywords have no effect but might be accepted for
+ compatibility with numpy.
+ **kwargs
+ Additional keywords have no effect but might be accepted for
+ compatibility with numpy.
+
+ Returns
+ -------
+ DataFrame
+ A DataFrame with the affected columns rounded to the specified
+ number of decimal places.
+
+ See Also
+ --------
+ numpy.around : Round a numpy array to the given number of decimals.
+ Series.round : Round a Series to the given number of decimals.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
+ ... columns=['dogs', 'cats'])
+ >>> df
+ dogs cats
+ 0 0.21 0.32
+ 1 0.01 0.67
+ 2 0.66 0.03
+ 3 0.21 0.18
+
+ By providing an integer each column is rounded to the same number
+ of decimal places
+
+ >>> df.round(1)
+ dogs cats
+ 0 0.2 0.3
+ 1 0.0 0.7
+ 2 0.7 0.0
+ 3 0.2 0.2
+
+ With a dict, the number of places for specific columns can be
+ specified with the column names as key and the number of decimal
+ places as value
+
+ >>> df.round({'dogs': 1, 'cats': 0})
+ dogs cats
+ 0 0.2 0.0
+ 1 0.0 1.0
+ 2 0.7 0.0
+ 3 0.2 0.0
+
+ Using a Series, the number of places for specific columns can be
+ specified with the column names as index and the number of
+ decimal places as value
+
+ >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
+ >>> df.round(decimals)
+ dogs cats
+ 0 0.2 0.0
+ 1 0.0 1.0
+ 2 0.7 0.0
+ 3 0.2 0.0
+ """
+ from pandas.core.reshape.concat import concat
+
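+ # Helpers: only integer/float columns are rounded; non-numeric columns
+ # and columns missing from a dict/Series ``decimals`` pass through.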
+ def _dict_round(df, decimals):
+ for col, vals in df.items():
+ try:
+ yield _series_round(vals, decimals[col])
+ except KeyError:
+ yield vals
+
+ def _series_round(s, decimals):
+ if is_integer_dtype(s) or is_float_dtype(s):
+ return s.round(decimals)
+ return s
+
+ nv.validate_round(args, kwargs)
+
+ if isinstance(decimals, (dict, Series)):
+ if isinstance(decimals, Series):
+ if not decimals.index.is_unique:
+ raise ValueError("Index of decimals must be unique")
+ new_cols = [col for col in _dict_round(self, decimals)]
+ elif is_integer(decimals):
+ # Dispatch to Series.round
+ new_cols = [_series_round(v, decimals) for _, v in self.items()]
+ else:
+ raise TypeError("decimals must be an integer, a dict-like or a " "Series")
+
+ if len(new_cols) > 0:
+ return self._constructor(
+ concat(new_cols, axis=1), index=self.index, columns=self.columns
+ )
+ else:
+ return self
+
+ # ----------------------------------------------------------------------
+ # Statistical methods, etc.
+
+ def corr(self, method="pearson", min_periods=1):
+ """
+ Compute pairwise correlation of columns, excluding NA/null values.
+
+ Parameters
+ ----------
+ method : {'pearson', 'kendall', 'spearman'} or callable
+ * pearson : standard correlation coefficient
+ * kendall : Kendall Tau correlation coefficient
+ * spearman : Spearman rank correlation
+ * callable: callable with input two 1d ndarrays
+ and returning a float. Note that the returned matrix from corr
+ will have 1 along the diagonals and will be symmetric
+ regardless of the callable's behavior.
+
+ .. versionadded:: 0.24.0
+
+ min_periods : int, optional
+ Minimum number of observations required per pair of columns
+ to have a valid result. Currently only available for Pearson
+ and Spearman correlation.
+
+ Returns
+ -------
+ DataFrame
+ Correlation matrix.
+
+ See Also
+ --------
+ DataFrame.corrwith
+ Series.corr
+
+ Examples
+ --------
+ >>> def histogram_intersection(a, b):
+ ... v = np.minimum(a, b).sum().round(decimals=1)
+ ... return v
+ >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
+ ... columns=['dogs', 'cats'])
+ >>> df.corr(method=histogram_intersection)
+ dogs cats
+ dogs 1.0 0.3
+ cats 0.3 1.0
+ """
+ numeric_df = self._get_numeric_data()
+ cols = numeric_df.columns
+ idx = cols.copy()
+ mat = numeric_df.values
+
+ if method == "pearson":
+ correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
+ elif method == "spearman":
+ correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
+ elif method == "kendall" or callable(method):
+ if min_periods is None:
+ min_periods = 1
+ mat = ensure_float64(mat).T
+ corrf = nanops.get_corr_func(method)
+ K = len(cols)
+ correl = np.empty((K, K), dtype=float)
+ mask = np.isfinite(mat)
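+ # Compute only the upper triangle and mirror it; the result is
+ # symmetric by construction, as documented above.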
+ for i, ac in enumerate(mat):
+ for j, bc in enumerate(mat):
+ if i > j:
+ continue
+
+ valid = mask[i] & mask[j]
+ if valid.sum() < min_periods:
+ c = np.nan
+ elif i == j:
+ c = 1.0
+ elif not valid.all():
+ c = corrf(ac[valid], bc[valid])
+ else:
+ c = corrf(ac, bc)
+ correl[i, j] = c
+ correl[j, i] = c
+ else:
+ raise ValueError(
+ "method must be either 'pearson', "
+ "'spearman', 'kendall', or a callable, "
+ "'{method}' was supplied".format(method=method)
+ )
+
+ return self._constructor(correl, index=idx, columns=cols)
+
+ def cov(self, min_periods=None):
+ """
+ Compute pairwise covariance of columns, excluding NA/null values.
+
+ Compute the pairwise covariance among the series of a DataFrame.
+ The returned data frame is the `covariance matrix
+ <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
+ of the DataFrame.
+
+ Both NA and null values are automatically excluded from the
+ calculation. (See the note below about bias from missing values.)
+ A threshold can be set for the minimum number of
+ observations for each value created. Comparisons with observations
+ below this threshold will be returned as ``NaN``.
+
+ This method is generally used for the analysis of time series data to
+ understand the relationship between different measures
+ across time.
+
+ Parameters
+ ----------
+ min_periods : int, optional
+ Minimum number of observations required per pair of columns
+ to have a valid result.
+
+ Returns
+ -------
+ DataFrame
+ The covariance matrix of the series of the DataFrame.
+
+ See Also
+ --------
+ Series.cov : Compute covariance with another Series.
+ core.window.EWM.cov: Exponential weighted sample covariance.
+ core.window.Expanding.cov : Expanding sample covariance.
+ core.window.Rolling.cov : Rolling sample covariance.
+
+ Notes
+ -----
+ Returns the covariance matrix of the DataFrame's time series.
+ The covariance is normalized by N-1.
+
+ For DataFrames that have Series that are missing data (assuming that
+ data is `missing at random
+ <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
+ the returned covariance matrix will be an unbiased estimate
+ of the variance and covariance between the member Series.
+
+ However, for many applications this estimate may not be acceptable
+ because the estimate covariance matrix is not guaranteed to be positive
+ semi-definite. This could lead to estimate correlations having
+ absolute values which are greater than one, and/or a non-invertible
+ covariance matrix. See `Estimation of covariance matrices
+ <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`__
+ for more details.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
+ ... columns=['dogs', 'cats'])
+ >>> df.cov()
+ dogs cats
+ dogs 0.666667 -1.000000
+ cats -1.000000 1.666667
+
+ >>> np.random.seed(42)
+ >>> df = pd.DataFrame(np.random.randn(1000, 5),
+ ... columns=['a', 'b', 'c', 'd', 'e'])
+ >>> df.cov()
+ a b c d e
+ a 0.998438 -0.020161 0.059277 -0.008943 0.014144
+ b -0.020161 1.059352 -0.008543 -0.024738 0.009826
+ c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
+ d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
+ e 0.014144 0.009826 -0.000271 -0.013692 0.977795
+
+ **Minimum number of periods**
+
+ This method also supports an optional ``min_periods`` keyword
+ that specifies the required minimum number of non-NA observations for
+ each column pair in order to have a valid result:
+
+ >>> np.random.seed(42)
+ >>> df = pd.DataFrame(np.random.randn(20, 3),
+ ... columns=['a', 'b', 'c'])
+ >>> df.loc[df.index[:5], 'a'] = np.nan
+ >>> df.loc[df.index[5:10], 'b'] = np.nan
+ >>> df.cov(min_periods=12)
+ a b c
+ a 0.316741 NaN -0.150812
+ b NaN 1.248003 0.191417
+ c -0.150812 0.191417 0.895202
+ """
+ numeric_df = self._get_numeric_data()
+ cols = numeric_df.columns
+ idx = cols.copy()
+ mat = numeric_df.values
+
+ if notna(mat).all():
+ if min_periods is not None and min_periods > len(mat):
+ baseCov = np.empty((mat.shape[1], mat.shape[1]))
+ baseCov.fill(np.nan)
+ else:
+ baseCov = np.cov(mat.T)
+ baseCov = baseCov.reshape((len(cols), len(cols)))
+ else:
+ baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)
+
+ return self._constructor(baseCov, index=idx, columns=cols)
+
+ def corrwith(self, other, axis=0, drop=False, method="pearson"):
+ """
+ Compute pairwise correlation between rows or columns of DataFrame
+ with rows or columns of Series or DataFrame. DataFrames are first
+ aligned along both axes before computing the correlations.
+
+ Parameters
+ ----------
+ other : DataFrame, Series
+ Object with which to compute correlations.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise.
+ drop : bool, default False
+ Drop missing indices from result.
+ method : {'pearson', 'kendall', 'spearman'} or callable
+ * pearson : standard correlation coefficient
+ * kendall : Kendall Tau correlation coefficient
+ * spearman : Spearman rank correlation
+ * callable: callable with input two 1d ndarrays
+ and returning a float
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ Series
+ Pairwise correlations.
+
+ See Also
+ --------
+ DataFrame.corr
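+
+ Examples
+ --------
+ A minimal illustration: each column of a frame correlates perfectly
+ with itself.
+
+ >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]})
+ >>> df.corrwith(df)
+ A 1.0
+ B 1.0
+ dtype: float64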
+ """
+ axis = self._get_axis_number(axis)
+ this = self._get_numeric_data()
+
+ if isinstance(other, Series):
+ return this.apply(lambda x: other.corr(x, method=method), axis=axis)
+
+ other = other._get_numeric_data()
+ left, right = this.align(other, join="inner", copy=False)
+
+ if axis == 1:
+ left = left.T
+ right = right.T
+
+ if method == "pearson":
+ # mask missing values
+ left = left + right * 0
+ right = right + left * 0
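+ # (adding a zeroed copy of the other operand propagates its NaNs, so
+ # both sides share the same missing-value mask before demeaning)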
+
+ # demeaned data
+ ldem = left - left.mean()
+ rdem = right - right.mean()
+
+ num = (ldem * rdem).sum()
+ dom = (left.count() - 1) * left.std() * right.std()
+
+ correl = num / dom
+
+ elif method in ["kendall", "spearman"] or callable(method):
+
+ def c(x):
+ return nanops.nancorr(x[0], x[1], method=method)
+
+ correl = Series(
+ map(c, zip(left.values.T, right.values.T)), index=left.columns
+ )
+
+ else:
+ raise ValueError(
+ "Invalid method {method} was passed, "
+ "valid methods are: 'pearson', 'kendall', "
+ "'spearman', or callable".format(method=method)
+ )
+
+ if not drop:
+ # Find non-matching labels along the given axis
+ # and append missing correlations (GH 22375)
+ raxis = 1 if axis == 0 else 0
+ result_index = this._get_axis(raxis).union(other._get_axis(raxis))
+ idx_diff = result_index.difference(correl.index)
+
+ if len(idx_diff) > 0:
+ correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff))
+
+ return correl
+
+ # ----------------------------------------------------------------------
+ # ndarray-like stats methods
+
+ def count(self, axis=0, level=None, numeric_only=False):
+ """
+ Count non-NA cells for each column or row.
+
+ The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
+ on `pandas.options.mode.use_inf_as_na`) are considered NA.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ If 0 or 'index' counts are generated for each column.
+ If 1 or 'columns' counts are generated for each **row**.
+ level : int or str, optional
+ If the axis is a `MultiIndex` (hierarchical), count along a
+ particular `level`, collapsing into a `DataFrame`.
+ A `str` specifies the level name.
+ numeric_only : bool, default False
+ Include only `float`, `int` or `boolean` data.
+
+ Returns
+ -------
+ Series or DataFrame
+ For each column/row the number of non-NA/null entries.
+ If `level` is specified returns a `DataFrame`.
+
+ See Also
+ --------
+ Series.count: Number of non-NA elements in a Series.
+ DataFrame.shape: Number of DataFrame rows and columns (including NA
+ elements).
+ DataFrame.isna: Boolean same-sized DataFrame showing places of NA
+ elements.
+
+ Examples
+ --------
+ Constructing DataFrame from a dictionary:
+
+ >>> df = pd.DataFrame({"Person":
+ ... ["John", "Myla", "Lewis", "John", "Myla"],
+ ... "Age": [24., np.nan, 21., 33, 26],
+ ... "Single": [False, True, True, True, False]})
+ >>> df
+ Person Age Single
+ 0 John 24.0 False
+ 1 Myla NaN True
+ 2 Lewis 21.0 True
+ 3 John 33.0 True
+ 4 Myla 26.0 False
+
+ Notice the uncounted NA values:
+
+ >>> df.count()
+ Person 5
+ Age 4
+ Single 5
+ dtype: int64
+
+ Counts for each **row**:
+
+ >>> df.count(axis='columns')
+ 0 3
+ 1 2
+ 2 3
+ 3 3
+ 4 3
+ dtype: int64
+
+ Counts for one level of a `MultiIndex`:
+
+ >>> df.set_index(["Person", "Single"]).count(level="Person")
+ Age
+ Person
+ John 2
+ Lewis 1
+ Myla 1
+ """
+ axis = self._get_axis_number(axis)
+ if level is not None:
+ return self._count_level(level, axis=axis, numeric_only=numeric_only)
+
+ if numeric_only:
+ frame = self._get_numeric_data()
+ else:
+ frame = self
+
+ # GH #423
+ if len(frame._get_axis(axis)) == 0:
+ result = Series(0, index=frame._get_agg_axis(axis))
+ else:
+ if frame._is_mixed_type or frame._data.any_extension_types:
+ # the or any_extension_types is really only hit for single-
+ # column frames with an extension array
+ result = notna(frame).sum(axis=axis)
+ else:
+ # GH13407
+ series_counts = notna(frame).sum(axis=axis)
+ counts = series_counts.values
+ result = Series(counts, index=frame._get_agg_axis(axis))
+
+ return result.astype("int64")
+
+ def _count_level(self, level, axis=0, numeric_only=False):
+ if numeric_only:
+ frame = self._get_numeric_data()
+ else:
+ frame = self
+
+ count_axis = frame._get_axis(axis)
+ agg_axis = frame._get_agg_axis(axis)
+
+ if not isinstance(count_axis, MultiIndex):
+ raise TypeError(
+ "Can only count levels on hierarchical "
+ "{ax}.".format(ax=self._get_axis_name(axis))
+ )
+
+ if frame._is_mixed_type:
+ # Since we have mixed types, calling notna(frame.values) might
+ # upcast everything to object
+ mask = notna(frame).values
+ else:
+ # But use the speedup when we have homogeneous dtypes
+ mask = notna(frame.values)
+
+ if axis == 1:
+ # We're transposing the mask rather than frame to avoid potential
+ # upcasts to object, which induces a ~20x slowdown
+ mask = mask.T
+
+ if isinstance(level, str):
+ level = count_axis._get_level_number(level)
+
+ level_index = count_axis.levels[level]
+ level_codes = ensure_int64(count_axis.codes[level])
+ counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)
+
+ result = DataFrame(counts, index=level_index, columns=agg_axis)
+
+ if axis == 1:
+ # Undo our earlier transpose
+ return result.T
+ else:
+ return result
+
+ def _reduce(
+ self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
+ ):
+ if axis is None and filter_type == "bool":
+ labels = None
+ constructor = None
+ else:
+ # TODO: Make other agg func handle axis=None properly
+ axis = self._get_axis_number(axis)
+ labels = self._get_agg_axis(axis)
+ constructor = self._constructor
+
+ def f(x):
+ return op(x, axis=axis, skipna=skipna, **kwds)
+
+ # exclude timedelta/datetime unless we are uniform types
+ if (
+ axis == 1
+ and self._is_datelike_mixed_type
+ and (
+ not self._is_homogeneous_type
+ and not is_datetime64tz_dtype(self.dtypes[0])
+ )
+ ):
+ numeric_only = True
+
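+ # numeric_only=None means "try, then fall back": first reduce over the
+ # full .values block, then column-by-column via frame_apply, and
+ # finally over the numeric (or bool) subset only.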
+ if numeric_only is None:
+ try:
+ values = self.values
+ result = f(values)
+
+ if filter_type == "bool" and is_object_dtype(values) and axis is None:
+ # work around https://github.com/numpy/numpy/issues/10489
+ # TODO: combine with hasattr(result, 'dtype') further down
+ # hard since we don't have `values` down there.
+ result = np.bool_(result)
+ except Exception as e:
+
+ # try by-column first
+ if filter_type is None and axis == 0:
+ try:
+
+ # this can end up with a non-reduction
+ # but not always. if the types are mixed
+ # with datelike then need to make sure a series
+
+ # we only end up here if we have not specified
+ # numeric_only and yet we have tried a
+ # column-by-column reduction, where we have mixed type.
+ # So let's just do what we can
+ from pandas.core.apply import frame_apply
+
+ opa = frame_apply(
+ self, func=f, result_type="expand", ignore_failures=True
+ )
+ result = opa.get_result()
+ if result.ndim == self.ndim:
+ result = result.iloc[0]
+ return result
+ except Exception:
+ pass
+
+ if filter_type is None or filter_type == "numeric":
+ data = self._get_numeric_data()
+ elif filter_type == "bool":
+ data = self._get_bool_data()
+ else: # pragma: no cover
+ e = NotImplementedError(
+ "Handling exception with filter_type {f} not "
+ "implemented.".format(f=filter_type)
+ )
+ raise_with_traceback(e)
+ with np.errstate(all="ignore"):
+ result = f(data.values)
+ labels = data._get_agg_axis(axis)
+ else:
+ if numeric_only:
+ if filter_type is None or filter_type == "numeric":
+ data = self._get_numeric_data()
+ elif filter_type == "bool":
+ # GH 25101, # GH 24434
+ data = self._get_bool_data() if axis == 0 else self
+ else: # pragma: no cover
+ msg = (
+ "Generating numeric_only data with filter_type {f} "
+ "not supported.".format(f=filter_type)
+ )
+ raise NotImplementedError(msg)
+ values = data.values
+ labels = data._get_agg_axis(axis)
+ else:
+ values = self.values
+ result = f(values)
+
+ if hasattr(result, "dtype") and is_object_dtype(result.dtype):
+ try:
+ if filter_type is None or filter_type == "numeric":
+ result = result.astype(np.float64)
+ elif filter_type == "bool" and notna(result).all():
+ result = result.astype(np.bool_)
+ except (ValueError, TypeError):
+
+ # try to coerce to the original dtypes item by item if we can
+ if axis == 0:
+ result = coerce_to_dtypes(result, self.dtypes)
+
+ if constructor is not None:
+ result = Series(result, index=labels)
+ return result
+
+ def nunique(self, axis=0, dropna=True):
+ """
+ Count distinct observations over requested axis.
+
+ Return Series with number of distinct observations. Can ignore NaN
+ values.
+
+ .. versionadded:: 0.20.0
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
+ column-wise.
+ dropna : bool, default True
+ Don't include NaN in the counts.
+
+ Returns
+ -------
+ Series
+
+ See Also
+ --------
+ Series.nunique: Method nunique for Series.
+ DataFrame.count: Count non-NA cells for each column or row.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
+ >>> df.nunique()
+ A 3
+ B 1
+ dtype: int64
+
+ >>> df.nunique(axis=1)
+ 0 1
+ 1 2
+ 2 2
+ dtype: int64
+ """
+ return self.apply(Series.nunique, axis=axis, dropna=dropna)
+
+ def idxmin(self, axis=0, skipna=True):
+ """
+ Return index of first occurrence of minimum over requested axis.
+ NA/null values are excluded.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ 0 or 'index' for row-wise, 1 or 'columns' for column-wise
+ skipna : boolean, default True
+ Exclude NA/null values. If an entire row/column is NA, the result
+ will be NA.
+
+ Returns
+ -------
+ Series
+ Indexes of minima along the specified axis.
+
+ Raises
+ ------
+ ValueError
+ * If the row/column is empty
+
+ See Also
+ --------
+ Series.idxmin
+
+ Notes
+ -----
+ This method is the DataFrame version of ``ndarray.argmin``.
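+
+ Examples
+ --------
+ A small illustration (values chosen so each column has a distinct
+ minimum):
+
+ >>> df = pd.DataFrame({'a': [2, 1, 3], 'b': [9, 8, 7]})
+ >>> df.idxmin()
+ a 1
+ b 2
+ dtype: int64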
+ """
+ axis = self._get_axis_number(axis)
+ indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
+ index = self._get_axis(axis)
+ result = [index[i] if i >= 0 else np.nan for i in indices]
+ return Series(result, index=self._get_agg_axis(axis))
+
+ def idxmax(self, axis=0, skipna=True):
+ """
+ Return index of first occurrence of maximum over requested axis.
+ NA/null values are excluded.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ 0 or 'index' for row-wise, 1 or 'columns' for column-wise
+ skipna : boolean, default True
+ Exclude NA/null values. If an entire row/column is NA, the result
+ will be NA.
+
+ Returns
+ -------
+ Series
+ Indexes of maxima along the specified axis.
+
+ Raises
+ ------
+ ValueError
+ * If the row/column is empty
+
+ See Also
+ --------
+ Series.idxmax
+
+ Notes
+ -----
+ This method is the DataFrame version of ``ndarray.argmax``.
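+
+ Examples
+ --------
+ A small illustration, mirroring ``idxmin``:
+
+ >>> df = pd.DataFrame({'a': [2, 1, 3], 'b': [9, 8, 7]})
+ >>> df.idxmax()
+ a 2
+ b 0
+ dtype: int64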
+ """
+ axis = self._get_axis_number(axis)
+ indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
+ index = self._get_axis(axis)
+ result = [index[i] if i >= 0 else np.nan for i in indices]
+ return Series(result, index=self._get_agg_axis(axis))
+
+ def _get_agg_axis(self, axis_num):
+ """
+ Let's be explicit about this.
+ """
+ if axis_num == 0:
+ return self.columns
+ elif axis_num == 1:
+ return self.index
+ else:
+ raise ValueError("Axis must be 0 or 1 (got %r)" % axis_num)
+
+ def mode(self, axis=0, numeric_only=False, dropna=True):
+ """
+ Get the mode(s) of each element along the selected axis.
+
+ The mode of a set of values is the value that appears most often.
+ It can be multiple values.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to iterate over while searching for the mode:
+
+ * 0 or 'index' : get mode of each column
+ * 1 or 'columns' : get mode of each row
+ numeric_only : bool, default False
+ If True, only apply to numeric columns.
+ dropna : bool, default True
+ Don't consider counts of NaN/NaT.
+
+ .. versionadded:: 0.24.0
+
+ Returns
+ -------
+ DataFrame
+ The modes of each column or row.
+
+ See Also
+ --------
+ Series.mode : Return the highest frequency value in a Series.
+ Series.value_counts : Return the counts of values in a Series.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame([('bird', 2, 2),
+ ... ('mammal', 4, np.nan),
+ ... ('arthropod', 8, 0),
+ ... ('bird', 2, np.nan)],
+ ... index=('falcon', 'horse', 'spider', 'ostrich'),
+ ... columns=('species', 'legs', 'wings'))
+ >>> df
+ species legs wings
+ falcon bird 2 2.0
+ horse mammal 4 NaN
+ spider arthropod 8 0.0
+ ostrich bird 2 NaN
+
+ By default, missing values are not considered, and the modes of wings
+ are both 0.0 and 2.0. The second row of species and legs contains
+ ``NaN``, because those columns have only one mode, but the DataFrame
+ has two rows.
+
+ >>> df.mode()
+ species legs wings
+ 0 bird 2.0 0.0
+ 1 NaN NaN 2.0
+
+ With ``dropna=False``, ``NaN`` values are considered and they can be
+ the mode (as for wings).
+
+ >>> df.mode(dropna=False)
+ species legs wings
+ 0 bird 2 NaN
+
+ With ``numeric_only=True``, only the mode of numeric columns is
+ computed, and columns of other types are ignored.
+
+ >>> df.mode(numeric_only=True)
+ legs wings
+ 0 2.0 0.0
+ 1 NaN 2.0
+
+ To compute the mode over columns and not rows, use the axis parameter:
+
+ >>> df.mode(axis='columns', numeric_only=True)
+ 0 1
+ falcon 2.0 NaN
+ horse 4.0 NaN
+ spider 0.0 8.0
+ ostrich 2.0 NaN
+ """
+ data = self if not numeric_only else self._get_numeric_data()
+
+ def f(s):
+ return s.mode(dropna=dropna)
+
+ return data.apply(f, axis=axis)
+
+ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
+ """
+ Return values at the given quantile over requested axis.
+
+ Parameters
+ ----------
+ q : float or array-like, default 0.5 (50% quantile)
+ Value(s) between 0 and 1, the quantile(s) to compute.
+ axis : {0, 1, 'index', 'columns'} (default 0)
+ Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+ numeric_only : bool, default True
+ If False, the quantile of datetime and timedelta data will be
+ computed as well.
+ interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+ This optional parameter specifies the interpolation method to use,
+ when the desired quantile lies between two data points `i` and `j`:
+
+ * linear: `i + (j - i) * fraction`, where `fraction` is the
+ fractional part of the index surrounded by `i` and `j`.
+ * lower: `i`.
+ * higher: `j`.
+ * nearest: `i` or `j` whichever is nearest.
+ * midpoint: (`i` + `j`) / 2.
+
+ .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ Series or DataFrame
+
+ If ``q`` is an array, a DataFrame will be returned where the
+ index is ``q``, the columns are the columns of self, and the
+ values are the quantiles.
+ If ``q`` is a float, a Series will be returned where the
+ index is the columns of self and the values are the quantiles.
+
+ See Also
+ --------
+ core.window.Rolling.quantile: Rolling quantile.
+ numpy.percentile: Numpy function to compute the percentile.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
+ ... columns=['a', 'b'])
+ >>> df.quantile(.1)
+ a 1.3
+ b 3.7
+ Name: 0.1, dtype: float64
+ >>> df.quantile([.1, .5])
+ a b
+ 0.1 1.3 3.7
+ 0.5 2.5 55.0
+
+ Specifying `numeric_only=False` will also compute the quantile of
+ datetime and timedelta data.
+
+ >>> df = pd.DataFrame({'A': [1, 2],
+ ... 'B': [pd.Timestamp('2010'),
+ ... pd.Timestamp('2011')],
+ ... 'C': [pd.Timedelta('1 days'),
+ ... pd.Timedelta('2 days')]})
+ >>> df.quantile(0.5, numeric_only=False)
+ A 1.5
+ B 2010-07-02 12:00:00
+ C 1 days 12:00:00
+ Name: 0.5, dtype: object
+ """
+ self._check_percentile(q)
+
+ data = self._get_numeric_data() if numeric_only else self
+ axis = self._get_axis_number(axis)
+ is_transposed = axis == 1
+
+ if is_transposed:
+ data = data.T
+
+ result = data._data.quantile(
+ qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
+ )
+
+ if result.ndim == 2:
+ result = self._constructor(result)
+ else:
+ result = self._constructor_sliced(result, name=q)
+
+ if is_transposed:
+ result = result.T
+
+ return result
+
+ def to_timestamp(self, freq=None, how="start", axis=0, copy=True):
+ """
+ Cast to DatetimeIndex of timestamps, at *beginning* of period.
+
+ Parameters
+ ----------
+ freq : str, default frequency of PeriodIndex
+ Desired frequency.
+ how : {'s', 'e', 'start', 'end'}
+ Convention for converting period to timestamp; start of period
+ vs. end.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to convert (the index by default).
+ copy : bool, default True
+ If False then underlying input data is not copied.
+
+ Returns
+ -------
+ DataFrame with DatetimeIndex
+ """
+ new_data = self._data
+ if copy:
+ new_data = new_data.copy()
+
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
+ elif axis == 1:
+ new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
+ else: # pragma: no cover
+ raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis))
+
+ return self._constructor(new_data)
+
+ def to_period(self, freq=None, axis=0, copy=True):
+ """
+ Convert DataFrame from DatetimeIndex to PeriodIndex with desired
+ frequency (inferred from index if not passed).
+
+ Parameters
+ ----------
+ freq : str, optional
+ Frequency of the PeriodIndex.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to convert (the index by default).
+ copy : bool, default True
+ If False then underlying input data is not copied.
+
+ Returns
+ -------
+ DataFrame with PeriodIndex
+ """
+ new_data = self._data
+ if copy:
+ new_data = new_data.copy()
+
+ axis = self._get_axis_number(axis)
+ if axis == 0:
+ new_data.set_axis(1, self.index.to_period(freq=freq))
+ elif axis == 1:
+ new_data.set_axis(0, self.columns.to_period(freq=freq))
+ else: # pragma: no cover
+ raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis))
+
+ return self._constructor(new_data)
+
+ def isin(self, values):
+ """
+ Whether each element in the DataFrame is contained in values.
+
+ Parameters
+ ----------
+ values : iterable, Series, DataFrame or dict
+ The result will only be true at a location if all the
+ labels match. If `values` is a Series, the match is done on
+ its index. If `values` is a dict, the keys must be the
+ column names, and each column is checked against the
+ corresponding value list. If `values` is a DataFrame,
+ then both the index and column labels must match.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame of booleans showing whether each element in the DataFrame
+ is contained in values.
+
+ See Also
+ --------
+ DataFrame.eq: Equality test for DataFrame.
+ Series.isin: Equivalent method on Series.
+ Series.str.contains: Test if pattern or regex is contained within a
+ string of a Series or Index.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
+ ... index=['falcon', 'dog'])
+ >>> df
+ num_legs num_wings
+ falcon 2 2
+ dog 4 0
+
+ When ``values`` is a list, check whether every value in the DataFrame
+ is present in the list (which animals have 0 or 2 legs or wings).
+
+ >>> df.isin([0, 2])
+ num_legs num_wings
+ falcon True True
+ dog False True
+
+ When ``values`` is a dict, we can pass values to check for each
+ column separately:
+
+ >>> df.isin({'num_wings': [0, 3]})
+ num_legs num_wings
+ falcon False False
+ dog False True
+
+ When ``values`` is a Series or DataFrame, both the index and the
+ column labels must match. Note that 'dog' does not match below:
+ ``other`` has no 'dog' row, and row labels missing from `values`
+ never match.
+
+ >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]},
+ ... index=['spider', 'falcon'])
+ >>> df.isin(other)
+ num_legs num_wings
+ falcon True True
+ dog False False
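+
+ When ``values`` is a Series, each column is compared against the
+ Series aligned on the row index (a small sketch with illustrative
+ values):
+
+ >>> df.isin(pd.Series([0, 4], index=['falcon', 'dog']))
+ num_legs num_wings
+ falcon False False
+ dog True False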
+ """
+ if isinstance(values, dict):
+ from pandas.core.reshape.concat import concat
+
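+ # Columns absent from ``values`` fall back to an empty list and
+ # therefore match nothing.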
+ values = collections.defaultdict(list, values)
+ return concat(
+ (
+ self.iloc[:, [i]].isin(values[col])
+ for i, col in enumerate(self.columns)
+ ),
+ axis=1,
+ )
+ elif isinstance(values, Series):
+ if not values.index.is_unique:
+ raise ValueError("cannot compute isin with a duplicate axis.")
+ return self.eq(values.reindex_like(self), axis="index")
+ elif isinstance(values, DataFrame):
+ if not (values.columns.is_unique and values.index.is_unique):
+ raise ValueError("cannot compute isin with a duplicate axis.")
+ return self.eq(values.reindex_like(self))
+ else:
+ if not is_list_like(values):
+ raise TypeError(
+ "only list-like or dict-like objects are "
+ "allowed to be passed to DataFrame.isin(), "
+ "you passed a "
+ "{0!r}".format(type(values).__name__)
+ )
+ return DataFrame(
+ algorithms.isin(self.values.ravel(), values).reshape(self.shape),
+ self.index,
+ self.columns,
+ )
+
+ # ----------------------------------------------------------------------
+ # Add plotting methods to DataFrame
+ plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
+ hist = pandas.plotting.hist_frame
+ boxplot = pandas.plotting.boxplot_frame
+ sparse = CachedAccessor("sparse", SparseFrameAccessor)
+
+
+DataFrame._setup_axes(
+ ["index", "columns"],
+ info_axis=1,
+ stat_axis=0,
+ axes_are_reversed=True,
+ aliases={"rows": 0},
+ docs={
+ "index": "The index (row labels) of the DataFrame.",
+ "columns": "The column labels of the DataFrame.",
+ },
+)
+DataFrame._add_numeric_operations()
+DataFrame._add_series_or_dataframe_operations()
+
+ops.add_flex_arithmetic_methods(DataFrame)
+ops.add_special_arithmetic_methods(DataFrame)
+
+
+def _from_nested_dict(data):
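+ # Invert a nested dict: {index -> {column -> value}} becomes
+ # {column -> {index -> value}}, preserving first-seen column order.
+ # Illustration (hypothetical input):
+ # {'r1': {'c1': 1}, 'r2': {'c1': 2, 'c2': 3}}
+ # -> {'c1': {'r1': 1, 'r2': 2}, 'c2': {'r2': 3}}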
+ # TODO: this should be seriously cythonized
+ new_data = OrderedDict()
+ for index, s in data.items():
+ for col, v in s.items():
+ new_data[col] = new_data.get(col, OrderedDict())
+ new_data[col][index] = v
+ return new_data
+
+
+def _put_str(s, space):
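+ # Truncate ``s`` to ``space`` characters and left-justify within that
+ # width, e.g. _put_str('abcdef', 4) -> 'abcd' and
+ # _put_str('ab', 4) -> 'ab  '.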
+ return "{s}".format(s=s)[:space].ljust(space)
+