[SPARK-39986][PS][DOC] Better example for Co-grouped Map

### What changes were proposed in this pull request? Currently, pandas `asof_merge` is used to explain Co-grouped Map with the pseudo-code below ``` df1.groupby(..).cogroup(df2.groupby(..)).applyInPandas(asof_join) ``` which may mislead users to think `asof_join` is not supported in Pandas API on Spark, but actually it is supported already. The PR proposes to use the unsupported `merge_sorted` instead. ### Why are the changes needed? Better documents. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual test. Closes apache#37412 from xinrong-meng/better_eg. Authored-by: Xinrong Meng <[email protected]> Signed-off-by: Hyukjin Kwon <[email protected]>
eruizalo · Aug 5, 2022 · 82dc17c · 82dc17c
1 parent e9e8dcb
commit 82dc17c
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 15 deletions.
diff --git a/examples/src/main/python/sql/arrow.py b/examples/src/main/python/sql/arrow.py
@@ -260,19 +260,19 @@ def cogrouped_apply_in_pandas_example(spark: SparkSession) -> None:
         [(20000101, 1, "x"), (20000101, 2, "y")],
         ("time", "id", "v2"))
 
-    def asof_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
-        return pd.merge_asof(left, right, on="time", by="id")
+    def merge_ordered(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
+        return pd.merge_ordered(left, right)
 
     df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
-        asof_join, schema="time int, id int, v1 double, v2 string").show()
-    # +--------+---+---+---+
-    # |    time| id| v1| v2|
-    # +--------+---+---+---+
-    # |20000101|  1|1.0|  x|
-    # |20000102|  1|3.0|  x|
-    # |20000101|  2|2.0|  y|
-    # |20000102|  2|4.0|  y|
-    # +--------+---+---+---+
+        merge_ordered, schema="time int, id int, v1 double, v2 string").show()
+    # +--------+---+---+----+
+    # |    time| id| v1|  v2|
+    # +--------+---+---+----+
+    # |20000101|  1|1.0|   x|
+    # |20000102|  1|3.0|null|
+    # |20000101|  2|2.0|   y|
+    # |20000102|  2|4.0|null|
+    # +--------+---+---+----+
 
 
 if __name__ == "__main__":

diff --git a/python/docs/source/getting_started/quickstart_df.ipynb b/python/docs/source/getting_started/quickstart_df.ipynb
@@ -903,11 +903,11 @@
     "    [(20000101, 1, 'x'), (20000101, 2, 'y')],\n",
     "    ('time', 'id', 'v2'))\n",
     "\n",
-    "def asof_join(l, r):\n",
-    "    return pd.merge_asof(l, r, on='time', by='id')\n",
+    "def merge_ordered(l, r):\n",
+    "    return pd.merge_ordered(l, r)\n",
     "\n",
     "df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(\n",
-    "    asof_join, schema='time int, id int, v1 double, v2 string').show()"
+    "    merge_ordered, schema='time int, id int, v1 double, v2 string').show()"
    ]
   },
   {
@@ -1174,4 +1174,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}