Merge branch 'main' into na_rep-bug

rsm-23 · Sep 27, 2023 · c28ab1a
2 parents 17bed8a + 61d2056
commit c28ab1a
Show file tree

Hide file tree

Showing 176 changed files with 1,997 additions and 1,068 deletions.
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-22.04
     strategy:
       matrix:
-        extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "consortium-standard", "all"]
+        extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"]
       fail-fast: false
     name: Install Extras - ${{ matrix.extra }}
     concurrency:

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -138,7 +138,7 @@ jobs:
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.15.0
+        uses: pypa/cibuildwheel@v2.16.0
         with:
          package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -20,11 +20,11 @@ ci:
 repos:
 -   repo: https://github.com/hauntsaninja/black-pre-commit-mirror
     # black compiled with mypyc
-    rev: 23.7.0
+    rev: 23.9.1
     hooks:
       - id: black
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.287
+    rev: v0.0.291
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -107,7 +107,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.10.1
+    rev: v3.13.0
     hooks:
     -   id: pyupgrade
         args: [--py39-plus]

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -841,6 +841,23 @@ def time_groupby_sum_multiindex(self):
         self.df.groupby(level=[0, 1]).sum()
 
 
+class SumTimeDelta:
+    # GH 20660
+    def setup(self):
+        N = 10**4
+        self.df = DataFrame(
+            np.random.randint(1000, 100000, (N, 100)),
+            index=np.random.randint(200, size=(N,)),
+        ).astype("timedelta64[ns]")
+        self.df_int = self.df.copy().astype("int64")
+
+    def time_groupby_sum_timedelta(self):
+        self.df.groupby(lambda x: x).sum()
+
+    def time_groupby_sum_int(self):
+        self.df_int.groupby(lambda x: x).sum()
+
+
 class Transform:
     def setup(self):
         n1 = 400

diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
@@ -34,7 +34,7 @@ dependencies:
   - gcsfs>=2022.05.0
   - jinja2>=3.1.2
   - lxml>=4.8.0
-  - matplotlib>=3.6.1
+  - matplotlib>=3.6.1, <3.8
   - numba>=0.55.2
   - numexpr>=2.8.0
   - odfpy>=1.4.1

diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
@@ -35,7 +35,7 @@ dependencies:
   - gcsfs>=2022.05.0
   - jinja2>=3.1.2
   - lxml>=4.8.0
-  - matplotlib>=3.6.1
+  - matplotlib>=3.6.1, <3.8
   - numba>=0.55.2
   - numexpr>=2.8.0
   - odfpy>=1.4.1

diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
@@ -34,7 +34,7 @@ dependencies:
   - gcsfs>=2022.05.0
   - jinja2>=3.1.2
   - lxml>=4.8.0
-  - matplotlib>=3.6.1
+  - matplotlib>=3.6.1, <3.8
   - numba>=0.55.2
   - numexpr>=2.8.0
   - odfpy>=1.4.1

diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
@@ -34,7 +34,7 @@ dependencies:
   - gcsfs>=2022.05.0
   - jinja2>=3.1.2
   - lxml>=4.8.0
-  - matplotlib>=3.6.1
+  - matplotlib>=3.6.1, <3.8
   - numba>=0.55.2
   - numexpr>=2.8.0
   - odfpy>=1.4.1

diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml
@@ -34,7 +34,7 @@ dependencies:
   - gcsfs>=2022.05.0
   - jinja2>=3.1.2
   - lxml>=4.8.0
-  - matplotlib>=3.6.1
+  - matplotlib>=3.6.1, <3.8
   # test_numba_vs_cython segfaults with numba 0.57
   - numba>=0.55.2, <0.57.0
   - numexpr>=2.8.0

diff --git a/doc/make.py b/doc/make.py
@@ -123,14 +123,14 @@ def _sphinx_build(self, kind: str):
 
         Parameters
         ----------
-        kind : {'html', 'latex'}
+        kind : {'html', 'latex', 'linkcheck'}
 
         Examples
         --------
         >>> DocBuilder(num_jobs=4)._sphinx_build('html')
         """
-        if kind not in ("html", "latex"):
-            raise ValueError(f"kind must be html or latex, not {kind}")
+        if kind not in ("html", "latex", "linkcheck"):
+            raise ValueError(f"kind must be html, latex or linkcheck, not {kind}")
 
         cmd = ["sphinx-build", "-b", kind]
         if self.num_jobs:
@@ -288,6 +288,12 @@ def zip_html(self):
         os.chdir(dirname)
         self._run_os("zip", zip_fname, "-r", "-q", *fnames)
 
+    def linkcheck(self):
+        """
+        Check for broken links in the documentation.
+        """
+        return self._sphinx_build("linkcheck")
+
 
 def main():
     cmds = [method for method in dir(DocBuilder) if not method.startswith("_")]

diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -162,7 +162,7 @@
 # General information about the project.
 project = "pandas"
 # We have our custom "pandas_footer.html" template, using copyright for the current year
-copyright = f"{datetime.now().year}"
+copyright = f"{datetime.now().year},"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -247,14 +247,14 @@ Dependency                                            Minimum Version    pip ext
 Visualization
 ^^^^^^^^^^^^^
 
-Installable with ``pip install "pandas[plot, output_formatting]"``.
+Installable with ``pip install "pandas[plot, output-formatting]"``.
 
 ========================= ================== ================== =============================================================
 Dependency                Minimum Version    pip extra          Notes
 ========================= ================== ================== =============================================================
 matplotlib                3.6.1              plot               Plotting library
-Jinja2                    3.1.2              output_formatting  Conditional formatting with DataFrame.style
-tabulate                  0.8.10             output_formatting  Printing in Markdown-friendly format (see `tabulate`_)
+Jinja2                    3.1.2              output-formatting  Conditional formatting with DataFrame.style
+tabulate                  0.8.10             output-formatting  Printing in Markdown-friendly format (see `tabulate`_)
 ========================= ================== ================== =============================================================
 
 Computation

diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst
@@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in
 
 .. ipython:: python
 
-    monthly_max = no_2.resample("M").max()
+    monthly_max = no_2.resample("ME").max()
     monthly_max
 
 A very powerful method on time series data with a datetime index is the

diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
@@ -771,7 +771,7 @@ To create year and month cross tabulation:
 
    df = pd.DataFrame(
        {"value": np.random.randn(36)},
-       index=pd.date_range("2011-01-01", freq="M", periods=36),
+       index=pd.date_range("2011-01-01", freq="ME", periods=36),
    )
 
    pd.pivot_table(
@@ -794,12 +794,12 @@ Apply
        index=["I", "II", "III"],
    )
 
-   def make_df(ser):
-       new_vals = [pd.Series(value, name=name) for name, value in ser.items()]
-       return pd.DataFrame(new_vals)
-
-   df_orgz = pd.concat({ind: row.pipe(make_df) for ind, row in df.iterrows()})
+   def SeriesFromSubList(aList):
+       return pd.Series(aList)
 
+   df_orgz = pd.concat(
+       {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
+   )
    df_orgz
 
 `Rolling apply with a DataFrame returning a Series

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -1213,6 +1213,19 @@ The dimension of the returned result can also change:
 
     grouped.apply(f)
 
+``apply`` on a Series can operate on a returned value from the applied function
+that is itself a Series, and possibly upcast the result to a DataFrame:
+
+.. ipython:: python
+
+    def f(x):
+        return pd.Series([x, x ** 2], index=["x", "x^2"])
+
+
+    s = pd.Series(np.random.rand(5))
+    s
+    s.apply(f)
+
 Similar to :ref:`groupby.aggregate.agg`, the resulting dtype will reflect that of the
 apply function. If the results from different groups have different dtypes, then
 a common dtype will be determined in the same way as ``DataFrame`` construction.
@@ -1403,7 +1416,7 @@ Groupby a specific column with the desired frequency. This is like resampling.
 
 .. ipython:: python
 
-   df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
+   df.groupby([pd.Grouper(freq="1ME", key="Date"), "Buyer"])[["Quantity"]].sum()
 
 When ``freq`` is specified, the object returned by ``pd.Grouper`` will be an
 instance of ``pandas.api.typing.TimeGrouper``. You have an ambiguous specification
@@ -1413,9 +1426,9 @@ in that you have a named index and a column that could be potential groupers.
 
    df = df.set_index("Date")
    df["Date"] = df.index + pd.offsets.MonthEnd(2)
-   df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum()
+   df.groupby([pd.Grouper(freq="6ME", key="Date"), "Buyer"])[["Quantity"]].sum()
 
-   df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum()
+   df.groupby([pd.Grouper(freq="6ME", level="Date"), "Buyer"])[["Quantity"]].sum()
 
 
 Taking the first rows of each group

diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -1837,7 +1837,7 @@ This however is operating on a copy and will not work.
    :okwarning:
    :okexcept:
 
-   with option_context('mode.chained_assignment','warn'):
+   with pd.option_context('mode.chained_assignment','warn'):
        dfb[dfb['a'].str.startswith('o')]['c'] = 42
 
 A chained assignment can also crop up in setting in a mixed dtype frame.
@@ -1879,7 +1879,7 @@ Last, the subsequent example will **not** work at all, and so should be avoided:
    :okwarning:
    :okexcept:
 
-   with option_context('mode.chained_assignment','raise'):
+   with pd.option_context('mode.chained_assignment','raise'):
        dfd.loc[0]['a'] = 1111
 
 .. warning::

diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
@@ -136,7 +136,7 @@ Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For d
 
 .. ipython:: python
 
-   pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C")
+   pd.pivot_table(df, values="D", index=pd.Grouper(freq="ME", key="F"), columns="C")
 
 .. _reshaping.pivot.margins:
 

diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
@@ -107,7 +107,7 @@ data however will be stored as ``object`` data.
 
    pd.Series(pd.period_range("1/1/2011", freq="M", periods=3))
    pd.Series([pd.DateOffset(1), pd.DateOffset(2)])
-   pd.Series(pd.date_range("1/1/2011", freq="M", periods=3))
+   pd.Series(pd.date_range("1/1/2011", freq="ME", periods=3))
 
 Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which
 is useful for representing missing or null date like values and behaves similar
@@ -450,7 +450,7 @@ variety of :ref:`frequency aliases <timeseries.offset_aliases>`:
 
 .. ipython:: python
 
-   pd.date_range(start, periods=1000, freq="M")
+   pd.date_range(start, periods=1000, freq="ME")
 
    pd.bdate_range(start, periods=250, freq="BQS")
 
@@ -882,7 +882,7 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ
     :class:`~pandas.tseries.offsets.Week`, ``'W'``, "one week, optionally anchored on a day of the week"
     :class:`~pandas.tseries.offsets.WeekOfMonth`, ``'WOM'``, "the x-th day of the y-th week of each month"
     :class:`~pandas.tseries.offsets.LastWeekOfMonth`, ``'LWOM'``, "the x-th day of the last week of each month"
-    :class:`~pandas.tseries.offsets.MonthEnd`, ``'M'``, "calendar month end"
+    :class:`~pandas.tseries.offsets.MonthEnd`, ``'ME'``, "calendar month end"
     :class:`~pandas.tseries.offsets.MonthBegin`, ``'MS'``, "calendar month begin"
     :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BM'``, "business month end"
     :class:`~pandas.tseries.offsets.BMonthBegin` or :class:`~pandas.tseries.offsets.BusinessMonthBegin`, ``'BMS'``, "business month begin"
@@ -1246,7 +1246,7 @@ frequencies. We will refer to these aliases as *offset aliases*.
     "C", "custom business day frequency"
     "D", "calendar day frequency"
     "W", "weekly frequency"
-    "M", "month end frequency"
+    "ME", "month end frequency"
     "SM", "semi-month end frequency (15th and end of month)"
     "BM", "business month end frequency"
     "CBM", "custom business month end frequency"
@@ -1690,7 +1690,7 @@ the end of the interval.
 .. warning::
 
     The default values for ``label`` and ``closed`` are '**left**' for all
-    frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W'
+    frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W'
     which all have a default of 'right'.
 
     This might unintendedly lead to looking ahead, where the value for a later
@@ -1856,15 +1856,15 @@ to resample based on datetimelike column in the frame, it can passed to the
        ),
    )
    df
-   df.resample("M", on="date")[["a"]].sum()
+   df.resample("ME", on="date")[["a"]].sum()
 
 Similarly, if you instead want to resample by a datetimelike
 level of ``MultiIndex``, its name or location can be passed to the
 ``level`` keyword.
 
 .. ipython:: python
 
-   df.resample("M", level="d")[["a"]].sum()
+   df.resample("ME", level="d")[["a"]].sum()
 
 .. _timeseries.iterating-label:
 
@@ -2137,7 +2137,7 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th
    pi.astype("datetime64[ns]")
 
    # convert to PeriodIndex
-   dti = pd.date_range("2011-01-01", freq="M", periods=3)
+   dti = pd.date_range("2011-01-01", freq="ME", periods=3)
    dti
    dti.astype("period[M]")
 
@@ -2256,7 +2256,7 @@ and vice-versa using ``to_timestamp``:
 
 .. ipython:: python
 
-   rng = pd.date_range("1/1/2012", periods=5, freq="M")
+   rng = pd.date_range("1/1/2012", periods=5, freq="ME")
 
    ts = pd.Series(np.random.randn(len(rng)), index=rng)
 

diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
@@ -24,6 +24,7 @@ Version 2.1
 .. toctree::
    :maxdepth: 2
 
+   v2.1.2
    v2.1.1
    v2.1.0
 

diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst
@@ -261,26 +261,15 @@ Convenience methods ``ffill`` and  ``bfill`` have been added:
   function, that is itself a series, and possibly upcast the result to a
   DataFrame
 
-  .. code-block:: python
-
-     >>> def f(x):
-     ...     return pd.Series([x, x ** 2], index=["x", "x^2"])
-     >>>
-     >>> s = pd.Series(np.random.rand(5))
-     >>> s
-     0    0.340445
-     1    0.984729
-     2    0.919540
-     3    0.037772
-     4    0.861549
-     dtype: float64
-     >>> s.apply(f)
-               x       x^2
-     0  0.340445  0.115903
-     1  0.984729  0.969691
-     2  0.919540  0.845555
-     3  0.037772  0.001427
-     4  0.861549  0.742267
+  .. ipython:: python
+
+      def f(x):
+          return pd.Series([x, x ** 2], index=["x", "x^2"])
+
+
+      s = pd.Series(np.random.rand(5))
+      s
+      s.apply(f)
 
 - New API functions for working with pandas options (:issue:`2097`):
-Original file line number
+Diff line change
@@ Expand Up / @@ -24,6 +24,7 @@ Version 2.1 @@
     .. toctree::
        :maxdepth: 2
+       v2.1.2
        v2.1.1
        v2.1.0
@@ Expand Down @@