From 2d5197349fca4028ff5ae6aaca9acfea5710e8ff Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Wed, 22 May 2024 23:08:21 +0400 Subject: [PATCH] refactor: Deprecate `how="outer"` join type in favour of more correct `how="full"` (left/right are *also* outer joins) --- crates/polars-lazy/src/frame/mod.rs | 12 +- .../src/physical_plan/streaming/checks.rs | 2 +- crates/polars-lazy/src/tests/streaming.rs | 6 +- crates/polars-ops/src/frame/join/args.rs | 10 +- crates/polars-ops/src/frame/join/general.rs | 2 +- .../src/frame/join/hash_join/mod.rs | 6 +- crates/polars-ops/src/frame/join/mod.rs | 25 ++-- .../executors/sinks/joins/generic_build.rs | 8 +- .../sinks/joins/generic_probe_outer.rs | 14 +- crates/polars-pipe/src/pipeline/convert.rs | 2 +- .../optimizer/predicate_pushdown/join.rs | 4 +- .../optimizer/projection_pushdown/joins.rs | 6 +- crates/polars-plan/src/logical_plan/schema.rs | 2 +- crates/polars-sql/src/context.rs | 2 +- crates/polars/src/docs/eager.rs | 2 +- crates/polars/src/docs/lazy.rs | 4 +- crates/polars/tests/it/core/date_like.rs | 2 +- crates/polars/tests/it/core/joins.rs | 52 ++++---- crates/polars/tests/it/joins.rs | 2 +- .../polars/tests/it/lazy/predicate_queries.rs | 2 +- .../tests/it/lazy/projection_queries.rs | 4 +- .../user-guide/transformations/joins.py | 12 +- .../rust/user-guide/transformations/joins.rs | 20 +-- docs/user-guide/transformations/joins.md | 121 ++++++++++-------- py-polars/polars/dataframe/frame.py | 28 ++-- py-polars/polars/functions/eager.py | 10 +- py-polars/polars/lazyframe/frame.py | 52 +++++--- py-polars/polars/type_aliases.py | 2 +- py-polars/pyproject.toml | 1 + py-polars/src/conversion/mod.rs | 6 +- py-polars/src/lazyframe/visitor/nodes.rs | 2 +- py-polars/tests/unit/dataframe/test_df.py | 2 +- .../tests/unit/datatypes/test_categorical.py | 6 +- py-polars/tests/unit/datatypes/test_float.py | 2 +- py-polars/tests/unit/operations/test_join.py | 36 +++--- .../unit/streaming/test_streaming_join.py | 29 +++-- py-polars/tests/unit/test_projections.py | 10 +- 37 files changed, 271 insertions(+), 237 deletions(-) diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index ef81e9bb999a5..03e48a059183c 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -1072,7 +1072,7 @@ impl LazyFrame { self.join(other, vec![], vec![], JoinArgs::new(JoinType::Cross)) } - /// Left join this query with another lazy query. + /// Left outer join this query with another lazy query. /// /// Matches on the values of the expressions `left_on` and `right_on`. For more /// flexible join logic, see [`join`](LazyFrame::join) or @@ -1122,7 +1122,7 @@ impl LazyFrame { ) } - /// Outer join this query with another lazy query. + /// Full outer join this query with another lazy query. /// /// Matches on the values of the expressions `left_on` and `right_on`. 
For more /// flexible join logic, see [`join`](LazyFrame::join) or @@ -1133,17 +1133,17 @@ impl LazyFrame { /// ```rust /// use polars_core::prelude::*; /// use polars_lazy::prelude::*; - /// fn outer_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { + /// fn full_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame { /// ldf - /// .outer_join(other, col("foo"), col("bar")) + /// .full_join(other, col("foo"), col("bar")) /// } /// ``` - pub fn outer_join>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame { + pub fn full_join>(self, other: LazyFrame, left_on: E, right_on: E) -> LazyFrame { self.join( other, [left_on.into()], [right_on.into()], - JoinArgs::new(JoinType::Outer), + JoinArgs::new(JoinType::Full), ) } diff --git a/crates/polars-lazy/src/physical_plan/streaming/checks.rs b/crates/polars-lazy/src/physical_plan/streaming/checks.rs index 5d1841237bf7e..3592a39cb4feb 100644 --- a/crates/polars-lazy/src/physical_plan/streaming/checks.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/checks.rs @@ -35,7 +35,7 @@ pub(super) fn streamable_join(args: &JoinArgs) -> bool { JoinCoalesce::JoinSpecific | JoinCoalesce::CoalesceColumns ) }, - JoinType::Outer { .. } => true, + JoinType::Full { .. } => true, _ => false, }; supported && !args.validation.needs_checks() diff --git a/crates/polars-lazy/src/tests/streaming.rs b/crates/polars-lazy/src/tests/streaming.rs index 1c51e480636d7..e34c16a34334d 100644 --- a/crates/polars-lazy/src/tests/streaming.rs +++ b/crates/polars-lazy/src/tests/streaming.rs @@ -297,7 +297,7 @@ fn test_streaming_partial() -> PolarsResult<()> { .left_on([col("a")]) .right_on([col("a")]) .suffix("_foo") - .how(JoinType::Outer) + .how(JoinType::Full) .coalesce(JoinCoalesce::CoalesceColumns) .finish(); @@ -400,7 +400,7 @@ fn test_sort_maintain_order_streaming() -> PolarsResult<()> { } #[test] -fn test_streaming_outer_join() -> PolarsResult<()> { +fn test_streaming_full_outer_join() -> PolarsResult<()> { let lf_left = df![ "a"=> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], "b"=> [0, 0, 0, 3, 0, 1, 3, 3, 3, 1, 4, 4, 2, 1, 1, 3, 1, 4, 2, 2], @@ -414,7 +414,7 @@ fn test_streaming_outer_join() -> PolarsResult<()> { .lazy(); let q = lf_left - .outer_join(lf_right, col("a"), col("a")) + .full_join(lf_right, col("a"), col("a")) .sort_by_exprs([all()], SortMultipleOptions::default()); // Toggle so that the join order is swapped. diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index ff57c1fb140e4..4fbc305968347 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -46,7 +46,7 @@ impl JoinCoalesce { Left | Inner => { matches!(self, JoinSpecific | CoalesceColumns) }, - Outer { .. } => { + Full { .. } => { matches!(self, CoalesceColumns) }, #[cfg(feature = "asof_join")] @@ -96,9 +96,9 @@ impl JoinArgs { #[derive(Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum JoinType { - Left, Inner, - Outer, + Left, + Full, #[cfg(feature = "asof_join")] AsOf(AsOfOptions), Cross, @@ -120,7 +120,7 @@ impl Display for JoinType { let val = match self { Left => "LEFT", Inner => "INNER", - Outer { .. } => "OUTER", + Full { .. 
} => "FULL", #[cfg(feature = "asof_join")] AsOf(_) => "ASOF", Cross => "CROSS", @@ -189,7 +189,7 @@ impl JoinValidation { if !self.needs_checks() { return Ok(()); } - polars_ensure!(matches!(join_type, JoinType::Inner | JoinType::Outer{..} | JoinType::Left), + polars_ensure!(matches!(join_type, JoinType::Inner | JoinType::Full{..} | JoinType::Left), ComputeError: "{self} validation on a {join_type} join is not supported"); Ok(()) } diff --git a/crates/polars-ops/src/frame/join/general.rs b/crates/polars-ops/src/frame/join/general.rs index 74f837c849ec5..2e4d38e2af0dd 100644 --- a/crates/polars-ops/src/frame/join/general.rs +++ b/crates/polars-ops/src/frame/join/general.rs @@ -46,7 +46,7 @@ pub fn _finish_join( Ok(df_left) } -pub fn _coalesce_outer_join( +pub fn _coalesce_full_join( mut df: DataFrame, keys_left: &[&str], keys_right: &[&str], diff --git a/crates/polars-ops/src/frame/join/hash_join/mod.rs b/crates/polars-ops/src/frame/join/hash_join/mod.rs index f6b1ca773ee48..f9291fdf2da1c 100644 --- a/crates/polars-ops/src/frame/join/hash_join/mod.rs +++ b/crates/polars-ops/src/frame/join/hash_join/mod.rs @@ -242,7 +242,7 @@ pub trait JoinDispatch: IntoDf { // indices are in bounds Ok(unsafe { ca_self._finish_anti_semi_join(&idx, slice) }) } - fn _outer_join_from_series( + fn _full_join_from_series( &self, other: &DataFrame, s_left: &Series, @@ -271,10 +271,10 @@ pub trait JoinDispatch: IntoDf { || unsafe { other.take_unchecked(&idx_ca_r) }, ); - let coalesce = args.coalesce.coalesce(&JoinType::Outer); + let coalesce = args.coalesce.coalesce(&JoinType::Full); let out = _finish_join(df_left, df_right, args.suffix.as_deref()); if coalesce { - Ok(_coalesce_outer_join( + Ok(_coalesce_full_join( out?, &[s_left.name()], &[s_right.name()], diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index ffbb1858c0a4a..d284fbe971633 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -25,7 +25,7 @@ pub use cross_join::CrossJoin; use either::Either; #[cfg(feature = "chunked_ids")] use general::create_chunked_index_mapping; -pub use general::{_coalesce_outer_join, _finish_join, _join_suffix_name}; +pub use general::{_coalesce_full_join, _finish_join, _join_suffix_name}; pub use hash_join::*; use hashbrown::hash_map::{Entry, RawEntryMut}; #[cfg(feature = "merge_sorted")] @@ -199,7 +199,7 @@ pub trait DataFrameJoinOps: IntoDf { ._inner_join_from_series(other, s_left, s_right, args, _verbose, drop_names), JoinType::Left => left_df ._left_join_from_series(other, s_left, s_right, args, _verbose, drop_names), - JoinType::Outer => left_df._outer_join_from_series(other, s_left, s_right, args), + JoinType::Full => left_df._full_join_from_series(other, s_left, s_right, args), #[cfg(feature = "semi_anti_join")] JoinType::Anti => left_df._semi_anti_join_from_series( s_left, @@ -271,14 +271,14 @@ pub trait DataFrameJoinOps: IntoDf { JoinType::Cross => { unreachable!() }, - JoinType::Outer => { + JoinType::Full => { let names_left = selected_left.iter().map(|s| s.name()).collect::>(); args.coalesce = JoinCoalesce::KeepColumns; let suffix = args.suffix.clone(); - let out = left_df._outer_join_from_series(other, &lhs_keys, &rhs_keys, args); + let out = left_df._full_join_from_series(other, &lhs_keys, &rhs_keys, args); if should_coalesce { - Ok(_coalesce_outer_join( + Ok(_coalesce_full_join( out?, &names_left, drop_names.as_ref().unwrap(), @@ -341,7 +341,7 @@ pub trait DataFrameJoinOps: IntoDf { self.join(other, left_on, right_on, 
JoinArgs::new(JoinType::Inner)) } - /// Perform a left join on two DataFrames + /// Perform a left outer join on two DataFrames /// # Example /// /// ```no_run @@ -384,27 +384,22 @@ pub trait DataFrameJoinOps: IntoDf { self.join(other, left_on, right_on, JoinArgs::new(JoinType::Left)) } - /// Perform an outer join on two DataFrames + /// Perform a full outer join on two DataFrames /// # Example /// /// ``` /// # use polars_core::prelude::*; /// # use polars_ops::prelude::*; /// fn join_dfs(left: &DataFrame, right: &DataFrame) -> PolarsResult { - /// left.outer_join(right, ["join_column_left"], ["join_column_right"]) + /// left.full_join(right, ["join_column_left"], ["join_column_right"]) /// } /// ``` - fn outer_join( - &self, - other: &DataFrame, - left_on: I, - right_on: I, - ) -> PolarsResult + fn full_join(&self, other: &DataFrame, left_on: I, right_on: I) -> PolarsResult where I: IntoIterator, S: AsRef, { - self.join(other, left_on, right_on, JoinArgs::new(JoinType::Outer)) + self.join(other, left_on, right_on, JoinArgs::new(JoinType::Full)) } } diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs index 1d1b5b7421755..b271862e7de07 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs @@ -14,7 +14,7 @@ use smartstring::alias::String as SmartString; use super::*; use crate::executors::operators::PlaceHolder; use crate::executors::sinks::joins::generic_probe_inner_left::GenericJoinProbe; -use crate::executors::sinks::joins::generic_probe_outer::GenericOuterJoinProbe; +use crate::executors::sinks::joins::generic_probe_outer::GenericFullOuterJoinProbe; use crate::executors::sinks::utils::{hash_rows, load_vec}; use crate::executors::sinks::HASHMAP_INIT_SIZE; use crate::expressions::PhysicalPipedExpr; @@ -337,9 +337,9 @@ impl Sink for GenericBuild { self.placeholder.replace(Box::new(probe_operator)); Ok(FinalizedSink::Operator) }, - JoinType::Outer => { - let coalesce = self.join_args.coalesce.coalesce(&JoinType::Outer); - let probe_operator = GenericOuterJoinProbe::new( + JoinType::Full => { + let coalesce = self.join_args.coalesce.coalesce(&JoinType::Full); + let probe_operator = GenericFullOuterJoinProbe::new( left_df, materialized_join_cols, suffix, diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs index 77db52b9f42c8..f2807dce24b5e 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_probe_outer.rs @@ -6,7 +6,7 @@ use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_ops::chunked_array::DfTake; use polars_ops::frame::join::_finish_join; -use polars_ops::prelude::_coalesce_outer_join; +use polars_ops::prelude::_coalesce_full_join; use smartstring::alias::String as SmartString; use crate::executors::sinks::joins::generic_build::*; @@ -18,7 +18,7 @@ use crate::expressions::PhysicalPipedExpr; use crate::operators::{DataChunk, Operator, OperatorResult, PExecutionContext}; #[derive(Clone)] -pub struct GenericOuterJoinProbe { +pub struct GenericFullOuterJoinProbe { /// all chunks are stacked into a single dataframe /// the dataframe is not rechunked. 
df_a: Arc, @@ -58,7 +58,7 @@ pub struct GenericOuterJoinProbe { key_names_right: Arc<[SmartString]>, } -impl GenericOuterJoinProbe { +impl GenericFullOuterJoinProbe { #[allow(clippy::too_many_arguments)] pub(super) fn new( df_a: DataFrame, @@ -75,7 +75,7 @@ impl GenericOuterJoinProbe { key_names_left: Arc<[SmartString]>, key_names_right: Arc<[SmartString]>, ) -> Self { - GenericOuterJoinProbe { + GenericFullOuterJoinProbe { df_a: Arc::new(df_a), df_b_dummy: None, materialized_join_cols, @@ -152,7 +152,7 @@ impl GenericOuterJoinProbe { .iter() .map(|s| s.as_str()) .collect::>(); - Ok(_coalesce_outer_join( + Ok(_coalesce_full_join( out, &l, &r, @@ -287,7 +287,7 @@ impl GenericOuterJoinProbe { } } -impl Operator for GenericOuterJoinProbe { +impl Operator for GenericFullOuterJoinProbe { fn execute( &mut self, context: &PExecutionContext, @@ -310,6 +310,6 @@ impl Operator for GenericOuterJoinProbe { Box::new(new) } fn fmt(&self) -> &str { - "generic_outer_join_probe" + "generic_full_join_probe" } } diff --git a/crates/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs index e094185c4519c..46d9482283b8b 100644 --- a/crates/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -299,7 +299,7 @@ where placeholder, )) as Box }, - JoinType::Outer { .. } => { + JoinType::Full { .. } => { // First get the names before we (potentially) swap. let key_names_left = join_columns_left .iter() diff --git a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/join.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/join.rs index 1cb5931d95bbd..7ca6cb67aec0c 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/join.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/join.rs @@ -70,7 +70,7 @@ fn join_produces_null(how: &JoinType) -> LeftRight { { match how { JoinType::Left => LeftRight(false, true), - JoinType::Outer { .. } | JoinType::Cross | JoinType::AsOf(_) => LeftRight(true, true), + JoinType::Full { .. } | JoinType::Cross | JoinType::AsOf(_) => LeftRight(true, true), _ => LeftRight(false, false), } } @@ -78,7 +78,7 @@ fn join_produces_null(how: &JoinType) -> LeftRight { { match how { JoinType::Left => LeftRight(false, true), - JoinType::Outer { .. } | JoinType::Cross => LeftRight(true, true), + JoinType::Full { .. } | JoinType::Cross => LeftRight(true, true), _ => LeftRight(false, false), } } diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs index 10e108d260086..0a663e9e91955 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs @@ -257,11 +257,11 @@ pub(super) fn process_join( .unwrap(); already_added_local_to_local_projected.insert(local_name); } - // In outer joins both columns remain. So `add_local=true` also for the right table - let add_local = matches!(options.args.how, JoinType::Outer) + // In full outer joins both columns remain. So `add_local=true` also for the right table + let add_local = matches!(options.args.how, JoinType::Full) && !options.args.coalesce.coalesce(&options.args.how); for e in &right_on { - // In case of outer joins we also add the columns. + // In case of full outer joins we also add the columns. // But before we do that we must check if the column wasn't already added by the lhs. 
let add_local = if add_local { !already_added_local_to_local_projected.contains(e.output_name()) diff --git a/crates/polars-plan/src/logical_plan/schema.rs b/crates/polars-plan/src/logical_plan/schema.rs index 2ee480c9727b8..f3e3eabddf19b 100644 --- a/crates/polars-plan/src/logical_plan/schema.rs +++ b/crates/polars-plan/src/logical_plan/schema.rs @@ -221,7 +221,7 @@ pub fn set_estimated_row_counts( let (known_size, estimated_size) = options.rows_left; (known_size, estimated_size, filter_count_left) }, - JoinType::Cross | JoinType::Outer { .. } => { + JoinType::Cross | JoinType::Full { .. } => { let (known_size_left, estimated_size_left) = options.rows_left; let (known_size_right, estimated_size_right) = options.rows_right; match (known_size_left, known_size_right) { diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index cc90dd1e10fe4..ab63ad035775b 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -319,7 +319,7 @@ impl SQLContext { lf = match &tbl.join_operator { JoinOperator::CrossJoin => lf.cross_join(rf), JoinOperator::FullOuter(constraint) => { - process_join(lf, rf, constraint, &l_name, &r_name, JoinType::Outer)? + process_join(lf, rf, constraint, &l_name, &r_name, JoinType::Full)? }, JoinOperator::Inner(constraint) => { process_join(lf, rf, constraint, &l_name, &r_name, JoinType::Inner)? diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index 7ea159c2ee8e7..a62872e8059d5 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -395,7 +395,7 @@ //! // join on a single column //! temp.left_join(&rain, ["days"], ["days"]); //! temp.inner_join(&rain, ["days"], ["days"]); -//! temp.outer_join(&rain, ["days"], ["days"]); +//! temp.full_join(&rain, ["days"], ["days"]); //! //! // join on multiple columns //! temp.join(&rain, vec!["days", "other"], vec!["days", "other"], JoinArgs::new(JoinType::Left)); diff --git a/crates/polars/src/docs/lazy.rs b/crates/polars/src/docs/lazy.rs index f4aa6e1cd8f11..c913674901304 100644 --- a/crates/polars/src/docs/lazy.rs +++ b/crates/polars/src/docs/lazy.rs @@ -145,7 +145,7 @@ //! let lf_a = df_a.clone().lazy(); //! let lf_b = df_b.clone().lazy(); //! -//! let joined = lf_a.join(lf_b, vec![col("a")], vec![col("foo")], JoinArgs::new(JoinType::Outer)).collect()?; +//! let joined = lf_a.join(lf_b, vec![col("a")], vec![col("foo")], JoinArgs::new(JoinType::Full)).collect()?; //! // joined: //! //! // ╭─────┬─────┬─────┬──────┬─────────╮ @@ -172,7 +172,7 @@ //! //! # let lf_a = df_a.clone().lazy(); //! # let lf_b = df_b.clone().lazy(); -//! let outer = lf_a.outer_join(lf_b, col("a"), col("foo")).collect()?; +//! let outer = lf_a.full_join(lf_b, col("a"), col("foo")).collect()?; //! //! # let lf_a = df_a.clone().lazy(); //! 
# let lf_b = df_b.clone().lazy(); diff --git a/crates/polars/tests/it/core/date_like.rs b/crates/polars/tests/it/core/date_like.rs index 48541d110ecd9..9bdbab80c2e9f 100644 --- a/crates/polars/tests/it/core/date_like.rs +++ b/crates/polars/tests/it/core/date_like.rs @@ -22,7 +22,7 @@ fn test_datelike_join() -> PolarsResult<()> { DataType::Datetime(TimeUnit::Nanoseconds, None) )); - let out = df.outer_join(&df.clone(), ["bar"], ["bar"])?; + let out = df.full_join(&df.clone(), ["bar"], ["bar"])?; assert!(matches!( out.column("bar")?.dtype(), DataType::Datetime(TimeUnit::Nanoseconds, None) diff --git a/crates/polars/tests/it/core/joins.rs b/crates/polars/tests/it/core/joins.rs index d4240e5bfa3d2..ac2acb8d91db1 100644 --- a/crates/polars/tests/it/core/joins.rs +++ b/crates/polars/tests/it/core/joins.rs @@ -113,13 +113,13 @@ fn test_left_join() { #[test] #[cfg_attr(miri, ignore)] -fn test_outer_join() -> PolarsResult<()> { +fn test_full_outer_join() -> PolarsResult<()> { let (temp, rain) = create_frames(); let joined = temp.join( &rain, ["days"], ["days"], - JoinArgs::new(JoinType::Outer).with_coalesce(JoinCoalesce::CoalesceColumns), + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), )?; assert_eq!(joined.height(), 5); assert_eq!(joined.column("days")?.sum::().unwrap(), 7); @@ -139,7 +139,7 @@ fn test_outer_join() -> PolarsResult<()> { &df_right, ["a"], ["a"], - JoinArgs::new(JoinType::Outer).with_coalesce(JoinCoalesce::CoalesceColumns), + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), )?; assert_eq!(out.column("c_right")?.null_count(), 1); @@ -248,19 +248,19 @@ fn test_join_multiple_columns() { .unwrap() .equals_missing(joined_inner.column("ham").unwrap())); - let joined_outer_hack = df_a.outer_join(&df_b, ["dummy"], ["dummy"]).unwrap(); - let joined_outer = df_a + let joined_full_outer_hack = df_a.full_join(&df_b, ["dummy"], ["dummy"]).unwrap(); + let joined_full_outer = df_a .join( &df_b, ["a", "b"], ["foo", "bar"], - JoinArgs::new(JoinType::Outer).with_coalesce(JoinCoalesce::CoalesceColumns), + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), ) .unwrap(); - assert!(joined_outer_hack + assert!(joined_full_outer_hack .column("ham") .unwrap() - .equals_missing(joined_outer.column("ham").unwrap())); + .equals_missing(joined_full_outer.column("ham").unwrap())); } #[test] @@ -300,7 +300,7 @@ fn test_join_categorical() { assert_eq!(Vec::from(ca), correct_ham); // test dispatch - for jt in [JoinType::Left, JoinType::Inner, JoinType::Outer] { + for jt in [JoinType::Left, JoinType::Inner, JoinType::Full] { let out = df_a.join(&df_b, ["b"], ["bar"], jt.into()).unwrap(); let out = out.column("b").unwrap(); assert_eq!( @@ -348,11 +348,11 @@ fn empty_df_join() -> PolarsResult<()> { assert_eq!(out.height(), 0); let out = empty_df.left_join(&df, ["key"], ["key"]).unwrap(); assert_eq!(out.height(), 0); - let out = empty_df.outer_join(&df, ["key"], ["key"]).unwrap(); + let out = empty_df.full_join(&df, ["key"], ["key"]).unwrap(); assert_eq!(out.height(), 1); df.left_join(&empty_df, ["key"], ["key"])?; df.inner_join(&empty_df, ["key"], ["key"])?; - df.outer_join(&empty_df, ["key"], ["key"])?; + df.full_join(&empty_df, ["key"], ["key"])?; let empty: Vec = vec![]; let _empty_df = DataFrame::new(vec![ @@ -458,24 +458,24 @@ fn test_joins_with_duplicates() -> PolarsResult<()> { assert_eq!(df_left_join.column("int_col")?.null_count(), 0); assert_eq!(df_left_join.column("dbl_col")?.null_count(), 1); - let df_outer_join = df_left + 
let df_full_outer_join = df_left .join( &df_right, ["col1"], ["join_col1"], - JoinArgs::new(JoinType::Outer).with_coalesce(JoinCoalesce::CoalesceColumns), + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), ) .unwrap(); // ensure the column names don't get swapped by the drop we do assert_eq!( - df_outer_join.get_column_names(), + df_full_outer_join.get_column_names(), &["col1", "int_col", "dbl_col"] ); - assert_eq!(df_outer_join.height(), 12); - assert_eq!(df_outer_join.column("col1")?.null_count(), 0); - assert_eq!(df_outer_join.column("int_col")?.null_count(), 1); - assert_eq!(df_outer_join.column("dbl_col")?.null_count(), 1); + assert_eq!(df_full_outer_join.height(), 12); + assert_eq!(df_full_outer_join.column("col1")?.null_count(), 0); + assert_eq!(df_full_outer_join.column("int_col")?.null_count(), 1); + assert_eq!(df_full_outer_join.column("dbl_col")?.null_count(), 1); Ok(()) } @@ -530,20 +530,20 @@ fn test_multi_joins_with_duplicates() -> PolarsResult<()> { assert_eq!(df_left_join.column("int_col")?.null_count(), 0); assert_eq!(df_left_join.column("dbl_col")?.null_count(), 1); - let df_outer_join = df_left + let df_full_outer_join = df_left .join( &df_right, &["col1", "join_col2"], &["join_col1", "col2"], - JoinArgs::new(JoinType::Outer).with_coalesce(JoinCoalesce::CoalesceColumns), + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), ) .unwrap(); - assert_eq!(df_outer_join.height(), 12); - assert_eq!(df_outer_join.column("col1")?.null_count(), 0); - assert_eq!(df_outer_join.column("join_col2")?.null_count(), 0); - assert_eq!(df_outer_join.column("int_col")?.null_count(), 1); - assert_eq!(df_outer_join.column("dbl_col")?.null_count(), 1); + assert_eq!(df_full_outer_join.height(), 12); + assert_eq!(df_full_outer_join.column("col1")?.null_count(), 0); + assert_eq!(df_full_outer_join.column("join_col2")?.null_count(), 0); + assert_eq!(df_full_outer_join.column("int_col")?.null_count(), 1); + assert_eq!(df_full_outer_join.column("dbl_col")?.null_count(), 1); Ok(()) } @@ -578,7 +578,7 @@ fn test_join_floats() -> PolarsResult<()> { &df_b, vec!["a", "c"], vec!["foo", "bar"], - JoinArgs::new(JoinType::Outer).with_coalesce(JoinCoalesce::CoalesceColumns), + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), )?; assert_eq!( out.dtypes(), diff --git a/crates/polars/tests/it/joins.rs b/crates/polars/tests/it/joins.rs index 80e9c31739b23..37ed6e2720d5c 100644 --- a/crates/polars/tests/it/joins.rs +++ b/crates/polars/tests/it/joins.rs @@ -23,7 +23,7 @@ fn join_nans_outer() -> PolarsResult<()> { .with(a2) .left_on(vec![col("w"), col("t")]) .right_on(vec![col("w"), col("t")]) - .how(JoinType::Outer) + .how(JoinType::Full) .coalesce(JoinCoalesce::CoalesceColumns) .join_nulls(true) .finish() diff --git a/crates/polars/tests/it/lazy/predicate_queries.rs b/crates/polars/tests/it/lazy/predicate_queries.rs index c5304149d682c..192c6150d7c0c 100644 --- a/crates/polars/tests/it/lazy/predicate_queries.rs +++ b/crates/polars/tests/it/lazy/predicate_queries.rs @@ -164,7 +164,7 @@ fn test_predicate_pushdown_blocked_by_outer_join() -> PolarsResult<()> { "b" => ["b2", "b3"], "c" => ["c2", "c3"] }?; - let df = df1.lazy().outer_join(df2.lazy(), col("b"), col("b")); + let df = df1.lazy().full_join(df2.lazy(), col("b"), col("b")); let out = df.filter(col("a").eq(lit("a1"))).collect()?; let null: Option<&str> = None; let expected = df![ diff --git a/crates/polars/tests/it/lazy/projection_queries.rs 
b/crates/polars/tests/it/lazy/projection_queries.rs index 56a43e6efed49..496b13ab0aea5 100644 --- a/crates/polars/tests/it/lazy/projection_queries.rs +++ b/crates/polars/tests/it/lazy/projection_queries.rs @@ -34,7 +34,7 @@ fn test_swap_rename() -> PolarsResult<()> { } #[test] -fn test_outer_join_with_column_2988() -> PolarsResult<()> { +fn test_full_outer_join_with_column_2988() -> PolarsResult<()> { let ldf1 = df![ "key1" => ["foo", "bar"], "key2" => ["foo", "bar"], @@ -54,7 +54,7 @@ fn test_outer_join_with_column_2988() -> PolarsResult<()> { ldf2, [col("key1"), col("key2")], [col("key1"), col("key2")], - JoinArgs::new(JoinType::Outer).with_coalesce(JoinCoalesce::CoalesceColumns), + JoinArgs::new(JoinType::Full).with_coalesce(JoinCoalesce::CoalesceColumns), ) .with_columns([col("key1")]) .collect()?; diff --git a/docs/src/python/user-guide/transformations/joins.py b/docs/src/python/user-guide/transformations/joins.py index 663d68b495172..aa853776fc5ee 100644 --- a/docs/src/python/user-guide/transformations/joins.py +++ b/docs/src/python/user-guide/transformations/joins.py @@ -36,17 +36,17 @@ print(df_left_join) # --8<-- [end:left] -# --8<-- [start:outer] -df_outer_join = df_customers.join(df_orders, on="customer_id", how="outer") +# --8<-- [start:full] +df_outer_join = df_customers.join(df_orders, on="customer_id", how="full") print(df_outer_join) -# --8<-- [end:outer] +# --8<-- [end:full] -# --8<-- [start:outer_coalesce] +# --8<-- [start:full_coalesce] df_outer_coalesce_join = df_customers.join( - df_orders, on="customer_id", how="outer_coalesce" + df_orders, on="customer_id", how="full", coalesce=True ) print(df_outer_coalesce_join) -# --8<-- [end:outer_coalesce] +# --8<-- [end:full_coalesce] # --8<-- [start:df3] df_colors = pl.DataFrame( diff --git a/docs/src/rust/user-guide/transformations/joins.rs b/docs/src/rust/user-guide/transformations/joins.rs index cb557d31be184..2def859242af2 100644 --- a/docs/src/rust/user-guide/transformations/joins.rs +++ b/docs/src/rust/user-guide/transformations/joins.rs @@ -50,33 +50,33 @@ fn main() -> Result<(), Box> { println!("{}", &df_left_join); // --8<-- [end:left] - // --8<-- [start:outer] - let df_outer_join = df_customers + // --8<-- [start:full] + let df_full_join = df_customers .clone() .lazy() .join( df_orders.clone().lazy(), [col("customer_id")], [col("customer_id")], - JoinArgs::new(JoinType::Outer), + JoinArgs::new(JoinType::Full), ) .collect()?; - println!("{}", &df_outer_join); - // --8<-- [end:outer] + println!("{}", &df_full_join); + // --8<-- [end:full] - // --8<-- [start:outer_coalesce] - let df_outer_join = df_customers + // --8<-- [start:full_coalesce] + let df_full_join = df_customers .clone() .lazy() .join( df_orders.clone().lazy(), [col("customer_id")], [col("customer_id")], - JoinArgs::new(JoinType::Outer), + JoinArgs::new(JoinType::Full), ) .collect()?; - println!("{}", &df_outer_join); - // --8<-- [end:outer_coalesce] + println!("{}", &df_full_join); + // --8<-- [end:full_coalesce] // --8<-- [start:df3] let df_colors = df!( diff --git a/docs/user-guide/transformations/joins.md b/docs/user-guide/transformations/joins.md index 70efcce5f310e..598179ded0a95 100644 --- a/docs/user-guide/transformations/joins.md +++ b/docs/user-guide/transformations/joins.md @@ -4,25 +4,28 @@ Polars supports the following join strategies by specifying the `how` argument: -| Strategy | Description | -| ---------------- | 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `inner` | Returns row with matching keys in _both_ frames. Non-matching rows in either the left or right frame are discarded. | -| `left` | Returns all rows in the left dataframe, whether or not a match in the right-frame is found. Non-matching rows have their right columns null-filled. | -| `outer` | Returns all rows from both the left and right dataframe. If no match is found in one frame, columns from the other frame are null-filled. | -| `outer_coalesce` | Returns all rows from both the left and right dataframe. This is similar to `outer`, but with the key columns being merged. | -| `cross` | Returns the Cartesian product of all rows from the left frame with all rows from the right frame. Duplicates rows are retained; the table length of `A` cross-joined with `B` is always `len(A) × len(B)`. | -| `semi` | Returns all rows from the left frame in which the join key is also present in the right frame. | -| `anti` | Returns all rows from the left frame in which the join key is _not_ present in the right frame. | +| Strategy | Description | +|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `inner` | Returns row with matching keys in _both_ frames. Non-matching rows in either the left or right frame are discarded. | +| `left` | Returns all rows in the left dataframe, whether or not a match in the right-frame is found. Non-matching rows have their right columns null-filled. | +| `full` | Returns all rows from both the left and right dataframe. If no match is found in one frame, columns from the other frame are null-filled. | +| `cross` | Returns the Cartesian product of all rows from the left frame with all rows from the right frame. Duplicates rows are retained; the table length of `A` cross-joined with `B` is always `len(A) × len(B)`. | +| `semi` | Returns all rows from the left frame in which the join key is also present in the right frame. | +| `anti` | Returns all rows from the left frame in which the join key is _not_ present in the right frame. | + +A separate `coalesce` parameter determines whether to merge key columns with the same name from the left and right +frames. ### Inner join -An `inner` join produces a `DataFrame` that contains only the rows where the join key exists in both `DataFrames`. Let's take for example the following two `DataFrames`: +An `inner` join produces a `DataFrame` that contains only the rows where the join key exists in both `DataFrames`. Let's +take for example the following two `DataFrames`: {{code_block('user-guide/transformations/joins','innerdf',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:setup" ---8<-- "python/user-guide/transformations/joins.py:innerdf" +--8 < -- "python/user-guide/transformations/joins.py:setup" +--8 < -- "python/user-guide/transformations/joins.py:innerdf" ```
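For context on the strategy table and the separate `coalesce` parameter described above, a minimal Python sketch of the renamed strategy (the frame contents here are illustrative and not taken from this patch):

```python
import polars as pl

df_customers = pl.DataFrame({"customer_id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]})
df_orders = pl.DataFrame({"order_id": ["a", "b", "c"], "customer_id": [1, 2, 2]})

# `how="full"` keeps every row from both frames; the right key survives as
# "customer_id_right" and is null for rows that only exist on the left (and vice versa).
df_full = df_customers.join(df_orders, on="customer_id", how="full")

# `coalesce=True` merges the two key columns into a single "customer_id",
# covering what the removed `outer_coalesce` strategy used to do.
df_full_coalesced = df_customers.join(df_orders, on="customer_id", how="full", coalesce=True)

print(df_full)
print(df_full_coalesced)
```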

@@ -30,59 +33,62 @@ An `inner` join produces a `DataFrame` that contains only the rows where the joi {{code_block('user-guide/transformations/joins','innerdf2',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:innerdf2" +--8<-- "python/user-guide/transformations/joins.py:innerdf2" ``` -To get a `DataFrame` with the orders and their associated customer we can do an `inner` join on the `customer_id` column: +To get a `DataFrame` with the orders and their associated customer we can do an `inner` join on the `customer_id` +column: {{code_block('user-guide/transformations/joins','inner',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:inner" +--8<-- "python/user-guide/transformations/joins.py:inner" ``` ### Left join -The `left` join produces a `DataFrame` that contains all the rows from the left `DataFrame` and only the rows from the right `DataFrame` where the join key exists in the left `DataFrame`. If we now take the example from above and want to have a `DataFrame` with all the customers and their associated orders (regardless of whether they have placed an order or not) we can do a `left` join: +The `left` outer join produces a `DataFrame` that contains all the rows from the left `DataFrame` and only the rows from +the right `DataFrame` where the join key exists in the left `DataFrame`. If we now take the example from above and want +to have a `DataFrame` with all the customers and their associated orders (regardless of whether they have placed an +order or not) we can do a `left` join: {{code_block('user-guide/transformations/joins','left',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:left" +--8<-- "python/user-guide/transformations/joins.py:left" ``` -Notice, that the fields for the customer with the `customer_id` of `3` are null, as there are no orders for this customer. +Notice that the fields for the customer with the `customer_id` of `3` are null, as there are no orders for this +customer. ### Outer join -The `outer` join produces a `DataFrame` that contains all the rows from both `DataFrames`. Columns are null, if the join key does not exist in the source `DataFrame`. Doing an `outer` join on the two `DataFrames` from above produces a similar `DataFrame` to the `left` join: +The `full` outer join produces a `DataFrame` that contains all the rows from both `DataFrames`. Columns are null if the +join key does not exist in the source `DataFrame`. Doing a `full` outer join on the two `DataFrames` from above produces +a similar `DataFrame` to the `left` join: -{{code_block('user-guide/transformations/joins','outer',['join'])}} +{{code_block('user-guide/transformations/joins','full',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:outer" +--8<-- "python/user-guide/transformations/joins.py:full" ``` -### Outer coalesce join - -The `outer_coalesce` join combines all rows from both `DataFrames` like an `outer` join, but it merges the join keys into a single column by coalescing the values. This ensures a unified view of the join key, avoiding nulls in key columns whenever possible.
Let's compare it with the outer join using the two `DataFrames` we used above: - -{{code_block('user-guide/transformations/joins','outer_coalesce',['join'])}} +{{code_block('user-guide/transformations/joins','full_coalesce',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:outer_coalesce" +--8<-- "python/user-guide/transformations/joins.py:full_coalesce" ``` -In contrast to an `outer` join, where `customer_id` and `customer_id_right` columns would remain separate, the `outer_coalesce` join merges these columns into a single `customer_id` column. - ### Cross join -A `cross` join is a Cartesian product of the two `DataFrames`. This means that every row in the left `DataFrame` is joined with every row in the right `DataFrame`. The `cross` join is useful for creating a `DataFrame` with all possible combinations of the columns in two `DataFrames`. Let's take for example the following two `DataFrames`. +A `cross` join is a Cartesian product of the two `DataFrames`. This means that every row in the left `DataFrame` is +joined with every row in the right `DataFrame`. The `cross` join is useful for creating a `DataFrame` with all possible +combinations of the columns in two `DataFrames`. Let's take for example the following two `DataFrames`. {{code_block('user-guide/transformations/joins','df3',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df3" +--8<-- "python/user-guide/transformations/joins.py:df3" ```
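Assuming the deprecation shim added elsewhere in this patch (the old `outer_coalesce` spelling is rewritten to `full` with `coalesce=True` and emits a warning), the old and new spellings used in the `full_coalesce` snippet above should produce identical frames; a small sketch with made-up data:

```python
import warnings

import polars as pl
from polars.testing import assert_frame_equal

left = pl.DataFrame({"key": [1, 2, 3], "x": ["a", "b", "c"]})
right = pl.DataFrame({"key": [2, 3, 4], "y": [20, 30, 40]})

new = left.join(right, on="key", how="full", coalesce=True).sort("key")

with warnings.catch_warnings():
    # The deprecated spelling still works, but now raises a DeprecationWarning.
    warnings.simplefilter("ignore", DeprecationWarning)
    old = left.join(right, on="key", how="outer_coalesce").sort("key")

assert_frame_equal(old, new)
```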

@@ -90,7 +96,7 @@ A `cross` join is a Cartesian product of the two `DataFrames`. This means that e {{code_block('user-guide/transformations/joins','df4',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df4" +--8<-- "python/user-guide/transformations/joins.py:df4" ``` We can now create a `DataFrame` containing all possible combinations of the colors and sizes with a `cross` join: @@ -98,21 +104,24 @@ We can now create a `DataFrame` containing all possible combinations of the colo {{code_block('user-guide/transformations/joins','cross',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:cross" +--8<-- "python/user-guide/transformations/joins.py:cross" ```
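Similarly, `how="outer"` itself keeps working but is now only an alias for `full` plus a deprecation warning; a test-style sketch (assuming the warning text added in `LazyFrame.join` below and a standard `DeprecationWarning`):

```python
import pytest
import polars as pl

left = pl.LazyFrame({"a": [1, 2]})
right = pl.LazyFrame({"a": [2, 3], "b": ["x", "y"]})

# The deprecated spelling warns when the join is built...
with pytest.deprecated_call(match="how='outer'"):
    deprecated = left.join(right, on="a", how="outer").collect()

# ...and produces the same result as the new spelling.
expected = left.join(right, on="a", how="full").collect()
assert deprecated.sort("a").equals(expected.sort("a"))
```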
-The `inner`, `left`, `outer` and `cross` join strategies are standard amongst dataframe libraries. We provide more details on the less familiar `semi`, `anti` and `asof` join strategies below. +The `inner`, `left`, `full` and `cross` join strategies are standard amongst dataframe libraries. We provide more +details on the less familiar `semi`, `anti` and `asof` join strategies below. ### Semi join -The `semi` join returns all rows from the left frame in which the join key is also present in the right frame. Consider the following scenario: a car rental company has a `DataFrame` showing the cars that it owns with each car having a unique `id`. +The `semi` join returns all rows from the left frame in which the join key is also present in the right frame. Consider +the following scenario: a car rental company has a `DataFrame` showing the cars that it owns with each car having a +unique `id`. {{code_block('user-guide/transformations/joins','df5',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df5" +--8<-- "python/user-guide/transformations/joins.py:df5" ``` The company has another `DataFrame` showing each repair job carried out on a vehicle. @@ -120,17 +129,18 @@ The company has another `DataFrame` showing each repair job carried out on a veh {{code_block('user-guide/transformations/joins','df6',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df6" +--8<-- "python/user-guide/transformations/joins.py:df6" ``` You want to answer this question: which of the cars have had repairs carried out? -An inner join does not answer this question directly as it produces a `DataFrame` with multiple rows for each car that has had multiple repair jobs: +An inner join does not answer this question directly as it produces a `DataFrame` with multiple rows for each car that +has had multiple repair jobs: {{code_block('user-guide/transformations/joins','inner2',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:inner2" +--8<-- "python/user-guide/transformations/joins.py:inner2" ``` However, a semi join produces a single row for each car that has had a repair job carried out. @@ -138,17 +148,19 @@ However, a semi join produces a single row for each car that has had a repair jo {{code_block('user-guide/transformations/joins','semi',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:semi" +--8<-- "python/user-guide/transformations/joins.py:semi" ``` ### Anti join -Continuing this example, an alternative question might be: which of the cars have **not** had a repair job carried out? An anti join produces a `DataFrame` showing all the cars from `df_cars` where the `id` is not present in the `df_repairs` `DataFrame`. +Continuing this example, an alternative question might be: which of the cars have **not** had a repair job carried out? +An anti join produces a `DataFrame` showing all the cars from `df_cars` where the `id` is not present in +the `df_repairs` `DataFrame`.
{{code_block('user-guide/transformations/joins','anti',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:anti" +--8<-- "python/user-guide/transformations/joins.py:anti" ``` ## Asof join @@ -156,12 +168,13 @@ Continuing this example, an alternative question might be: which of the cars hav An `asof` join is like a left join except that we match on nearest key rather than equal keys. In Polars we can do an asof join with the `join_asof` method. -Consider the following scenario: a stock market broker has a `DataFrame` called `df_trades` showing transactions it has made for different stocks. +Consider the following scenario: a stock market broker has a `DataFrame` called `df_trades` showing transactions it has +made for different stocks. {{code_block('user-guide/transformations/joins','df7',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df7" +--8<-- "python/user-guide/transformations/joins.py:df7" ``` The broker has another `DataFrame` called `df_quotes` showing prices it has quoted for these stocks. @@ -169,27 +182,31 @@ The broker has another `DataFrame` called `df_quotes` showing prices it has quot {{code_block('user-guide/transformations/joins','df8',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:df8" +--8<-- "python/user-guide/transformations/joins.py:df8" ``` -You want to produce a `DataFrame` showing for each trade the most recent quote provided _before_ the trade. You do this with `join_asof` (using the default `strategy = "backward"`). -To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the stock column with `by="stock"`. +You want to produce a `DataFrame` showing for each trade the most recent quote provided _before_ the trade. You do this +with `join_asof` (using the default `strategy = "backward"`). +To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the +stock column with `by="stock"`. {{code_block('user-guide/transformations/joins','asof',['join_asof'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:asofpre" ---8<-- "python/user-guide/transformations/joins.py:asof" +--8<-- "python/user-guide/transformations/joins.py:asofpre" +--8<-- "python/user-guide/transformations/joins.py:asof" ``` -If you want to make sure that only quotes within a certain time range are joined to the trades you can specify the `tolerance` argument. In this case we want to make sure that the last preceding quote is within 1 minute of the trade so we set `tolerance = "1m"`. +If you want to make sure that only quotes within a certain time range are joined to the trades you can specify +the `tolerance` argument. In this case we want to make sure that the last preceding quote is within 1 minute of the +trade so we set `tolerance = "1m"`.
=== ":fontawesome-brands-python: Python" ```python ---8<-- "python/user-guide/transformations/joins.py:asof2" +--8 < -- "python/user-guide/transformations/joins.py:asof2" ``` ```python exec="on" result="text" session="user-guide/transformations/joins" ---8<-- "python/user-guide/transformations/joins.py:asof2" +--8 < -- "python/user-guide/transformations/joins.py:asof2" ``` diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index b5a2684643557..b5617a05dbfb3 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -6547,7 +6547,7 @@ def join( DataFrame to join with. on Name(s) of the join columns in both DataFrames. - how : {'inner', 'left', 'outer', 'semi', 'anti', 'cross', 'outer_coalesce'} + how : {'inner', 'left', 'full', 'semi', 'anti', 'cross'} Join strategy. * *inner* @@ -6555,10 +6555,8 @@ def join( * *left* Returns all rows from the left table, and the matched rows from the right table - * *outer* + * *full* Returns all rows when there is a match in either left or right table - * *outer_coalesce* - Same as 'outer', but coalesces the key columns * *cross* Returns the Cartesian product of rows from both tables * *semi* @@ -6631,7 +6629,7 @@ def join( │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ - >>> df.join(other_df, on="ham", how="outer") + >>> df.join(other_df, on="ham", how="full") shape: (4, 5) ┌──────┬──────┬──────┬───────┬───────────┐ │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ @@ -10666,7 +10664,7 @@ def update( self, other: DataFrame, on: str | Sequence[str] | None = None, - how: Literal["left", "inner", "outer"] = "left", + how: Literal["left", "inner", "full"] = "left", *, left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, @@ -10690,11 +10688,11 @@ def update( on Column names that will be joined on. If set to `None` (default), the implicit row index of each frame is used as a join key. - how : {'left', 'inner', 'outer'} + how : {'left', 'inner', 'full'} * 'left' will keep all rows from the left table; rows may be duplicated if multiple rows in the right frame match the left row's key. * 'inner' keeps only those rows where the key exists in both frames. - * 'outer' will update existing rows where the key matches while also + * 'full' will update existing rows where the key matches while also adding any new rows contained in the given frame. left_on Join column(s) of the left DataFrame. @@ -10766,10 +10764,10 @@ def update( │ 3 ┆ -99 │ └─────┴─────┘ - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: + Update `df` values with the non-null values in `new_df`, using a full + outer join strategy that defines explicit join columns in each frame: - >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="full") shape: (5, 2) ┌─────┬─────┐ │ A ┆ B │ @@ -10783,12 +10781,10 @@ def update( │ 5 ┆ -66 │ └─────┴─────┘ - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: + Update `df` values including null values in `new_df`, using a full outer + join strategy that defines explicit join columns in each frame: - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... 
) + >>> df.update(new_df, left_on="A", right_on="C", how="full", include_nulls=True) shape: (5, 2) ┌─────┬──────┐ │ A ┆ B │ diff --git a/py-polars/polars/functions/eager.py b/py-polars/polars/functions/eager.py index cefa60eedd6ad..9230d985c89c4 100644 --- a/py-polars/polars/functions/eager.py +++ b/py-polars/polars/functions/eager.py @@ -156,12 +156,12 @@ def concat( msg = "'align' strategy requires at least one common column" raise InvalidOperationError(msg) - # align the frame data using an outer join with no suffix-resolution + # align the frame data using a full outer join with no suffix-resolution # (so we raise an error in case of column collision, like "horizontal") lf: LazyFrame = reduce( lambda x, y: ( - x.join(y, how="outer", on=common_cols, suffix="_PL_CONCAT_RIGHT") - # Coalesce outer join columns + x.join(y, how="full", on=common_cols, suffix="_PL_CONCAT_RIGHT") + # Coalesce full outer join columns .with_columns( [ F.coalesce([name, f"{name}_PL_CONCAT_RIGHT"]) @@ -262,7 +262,7 @@ def concat( def _alignment_join( *idx_frames: tuple[int, LazyFrame], align_on: list[str], - how: JoinStrategy = "outer", + how: JoinStrategy = "full", descending: bool | Sequence[bool] = False, ) -> LazyFrame: """Create a single master frame with all rows aligned on the common key values.""" @@ -286,7 +286,7 @@ def join_func( def align_frames( *frames: FrameType, on: str | Expr | Sequence[str] | Sequence[Expr] | Sequence[str | Expr], - how: JoinStrategy = "outer", + how: JoinStrategy = "full", select: str | Expr | Sequence[str | Expr] | None = None, descending: bool | Sequence[bool] = False, ) -> list[FrameType]: diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index bab3073fee2a0..abfd869c5792c 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -3834,7 +3834,7 @@ def join( on Join column of both DataFrames. If set, `left_on` and `right_on` should be None. - how : {'inner', 'left', 'outer', 'semi', 'anti', 'cross', 'outer_coalesce'} + how : {'inner', 'left', 'full', 'semi', 'anti', 'cross'} Join strategy. 
* *inner* @@ -3842,7 +3842,7 @@ def join( * *left* Returns all rows from the left table, and the matched rows from the right table - * *outer* + * *full* Returns all rows when there is a match in either left or right table * *cross* Returns the Cartesian product of rows from both tables @@ -3917,7 +3917,7 @@ def join( │ 1 ┆ 6.0 ┆ a ┆ x │ │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="outer").collect() + >>> lf.join(other_lf, on="ham", how="full").collect() shape: (4, 5) ┌──────┬──────┬──────┬───────┬───────────┐ │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ @@ -3964,6 +3964,13 @@ def join( msg = f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}" raise TypeError(msg) + if how == "outer": + how = "full" + issue_deprecation_warning( + "Use of `how='outer'` should be replaced with `how='full'`.", + version="0.20.29", + ) + if how == "cross": return self._from_pyldf( self._ldf.join( @@ -3992,6 +3999,11 @@ def join( if how == "outer_coalesce": coalesce = True + how = "full" + issue_deprecation_warning( + "Use of `how='outer_coalesce'` should be replaced with `how='full', coalesce=True`.", + version="0.20.29", + ) return self._from_pyldf( self._ldf.join( @@ -5917,7 +5929,7 @@ def update( self, other: LazyFrame, on: str | Sequence[str] | None = None, - how: Literal["left", "inner", "outer"] = "left", + how: Literal["left", "inner", "full"] = "left", *, left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, @@ -5937,11 +5949,11 @@ def update( on Column names that will be joined on. If set to `None` (default), the implicit row index of each frame is used as a join key. - how : {'left', 'inner', 'outer'} + how : {'left', 'inner', 'full'} * 'left' will keep all rows from the left table; rows may be duplicated if multiple rows in the right frame match the left row's key. * 'inner' keeps only those rows where the key exists in both frames. - * 'outer' will update existing rows where the key matches while also + * 'full' will update existing rows where the key matches while also adding any new rows contained in the given frame. left_on Join column(s) of the left DataFrame. @@ -6013,10 +6025,10 @@ def update( │ 3 ┆ -99 │ └─────┴─────┘ - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: + Update `df` values with the non-null values in `new_df`, using a full + outer join strategy that defines explicit join columns in each frame: - >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="full").collect() shape: (5, 2) ┌─────┬─────┐ │ A ┆ B │ @@ -6030,11 +6042,11 @@ def update( │ 5 ┆ -66 │ └─────┴─────┘ - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: + Update `df` values including null values in `new_df`, using a full + outer join strategy that defines explicit join columns in each frame: >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... new_lf, left_on="A", right_on="C", how="full", include_nulls=True ... 
).collect() shape: (5, 2) ┌─────┬──────┐ @@ -6049,11 +6061,16 @@ def update( │ 5 ┆ -66 │ └─────┴──────┘ """ - if how not in ("left", "inner", "outer"): - msg = f"`how` must be one of {{'left', 'inner', 'outer'}}; found {how!r}" + if how in ("outer", "outer_coalesce"): + how = "full" + issue_deprecation_warning( + "Use of `how='outer'` should be replaced with `how='full'`.", + version="0.20.29", + ) + + if how not in ("left", "inner", "full"): + msg = f"`how` must be one of {{'left', 'inner', 'full'}}; found {how!r}" raise ValueError(msg) - if how == "outer": - how = "outer_coalesce" # type: ignore[assignment] row_index_used = False if on is None: @@ -6093,7 +6110,7 @@ def update( raise ValueError(msg) # no need to join if *only* join columns are in other (inner/left update only) - if how != "outer_coalesce" and len(other.columns) == len(right_on): # type: ignore[comparison-overlap, redundant-expr] + if how != "full" and len(other.columns) == len(right_on): if row_index_used: return self.drop(row_index_name) return self @@ -6120,6 +6137,7 @@ def update( right_on=right_on, how=how, suffix=tmp_name, + coalesce=True, ) .with_columns( ( diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 92daf0b2dd0c9..d65e36be40ea8 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -140,7 +140,7 @@ ClosedInterval: TypeAlias = Literal["left", "right", "both", "none"] # ClosedWindow InterpolationMethod: TypeAlias = Literal["linear", "nearest"] JoinStrategy: TypeAlias = Literal[ - "inner", "left", "outer", "semi", "anti", "cross", "outer_coalesce" + "inner", "left", "full", "semi", "anti", "cross", "outer", "outer_coalesce" ] # JoinType RollingInterpolationMethod: TypeAlias = Literal[ "nearest", "higher", "lower", "midpoint", "linear" diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index 9ac0df0cd121a..17e74da105e93 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -235,6 +235,7 @@ filterwarnings = [ # https://github.com/pola-rs/polars/issues/14466 "ignore:unclosed file.*:ResourceWarning", "ignore:the 'pyxlsb' engine is deprecated.*:DeprecationWarning", + "ignore:Use of `how='outer(_coalesce)?'` should be replaced with `how='full'.*:DeprecationWarning", ] xfail_strict = true diff --git a/py-polars/src/conversion/mod.rs b/py-polars/src/conversion/mod.rs index b0ab2ba8019b7..79d90ee88fa93 100644 --- a/py-polars/src/conversion/mod.rs +++ b/py-polars/src/conversion/mod.rs @@ -720,10 +720,10 @@ impl<'py> FromPyObject<'py> for Wrap { let parsed = match &*ob.extract::()? { "inner" => JoinType::Inner, "left" => JoinType::Left, - "outer" => JoinType::Outer, + "full" => JoinType::Full, "outer_coalesce" => { // TODO! 
deprecate - JoinType::Outer + JoinType::Full }, "semi" => JoinType::Semi, "anti" => JoinType::Anti, @@ -731,7 +731,7 @@ impl<'py> FromPyObject<'py> for Wrap { "cross" => JoinType::Cross, v => { return Err(PyValueError::new_err(format!( - "`how` must be one of {{'inner', 'left', 'outer', 'semi', 'anti', 'cross'}}, got {v}", + "`how` must be one of {{'inner', 'left', 'full', 'semi', 'anti', 'cross'}}, got {v}", ))) }, }; diff --git a/py-polars/src/lazyframe/visitor/nodes.rs b/py-polars/src/lazyframe/visitor/nodes.rs index fe4a3fcf2b3e3..88f7143969b72 100644 --- a/py-polars/src/lazyframe/visitor/nodes.rs +++ b/py-polars/src/lazyframe/visitor/nodes.rs @@ -406,7 +406,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { match options.args.how { JoinType::Left => "left", JoinType::Inner => "inner", - JoinType::Outer => "outer", + JoinType::Full => "full", JoinType::AsOf(_) => return Err(PyNotImplementedError::new_err("asof join")), JoinType::Cross => "cross", JoinType::Semi => "leftsemi", diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index f122e71c530b7..02377153c8636 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -2294,7 +2294,7 @@ def test_join_suffixes() -> None: df_a = pl.DataFrame({"A": [1], "B": [1]}) df_b = pl.DataFrame({"A": [1], "B": [1]}) - join_strategies: list[JoinStrategy] = ["left", "inner", "outer", "cross"] + join_strategies: list[JoinStrategy] = ["left", "inner", "full", "cross"] for how in join_strategies: # no need for an assert, we error if wrong df_a.join(df_b, on="A", suffix="_y", how=how)["B_y"] diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index b644e4b8c3f7e..c310cab18fe1f 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -20,7 +20,7 @@ @StringCache() -def test_categorical_outer_join() -> None: +def test_categorical_full_outer_join() -> None: df1 = pl.DataFrame( [ pl.Series("key1", [42]), @@ -49,7 +49,7 @@ def test_categorical_outer_join() -> None: schema_overrides={"key2": pl.Categorical, "key2_right": pl.Categorical}, ) - out = df1.join(df2, on=["key1", "key2"], how="outer").collect() + out = df1.join(df2, on=["key1", "key2"], how="full").collect() assert_frame_equal(out, expected) dfa = pl.DataFrame( @@ -65,7 +65,7 @@ def test_categorical_outer_join() -> None: ] ) - df = dfa.join(dfb, on="key", how="outer") + df = dfa.join(dfb, on="key", how="full") # the cast is important to test the rev map assert df["key"].cast(pl.String).to_list() == ["bar", None, "foo"] assert df["key_right"].cast(pl.String).to_list() == ["bar", "baz", None] diff --git a/py-polars/tests/unit/datatypes/test_float.py b/py-polars/tests/unit/datatypes/test_float.py index eac6c70f92adc..eaa8a045b5972 100644 --- a/py-polars/tests/unit/datatypes/test_float.py +++ b/py-polars/tests/unit/datatypes/test_float.py @@ -232,7 +232,7 @@ def test_joins() -> None: ) assert_series_equal(expect, out) - how = "outer" + how = "full" expect = pl.Series("rhs", [True, True, True, True, None, None, True]) out = ( df.join(rhs, on=join_on, how=how) # type: ignore[arg-type] diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 3722cdc356089..77fa901121038 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -174,7 +174,7 @@ def test_join() -> 
None: assert joined["c_right"].is_null().sum() == 1 assert_series_equal(joined["b"], pl.Series("b", [1, 3, 2, 2, 4])) - joined = df_left.join(df_right, left_on="a", right_on="a", how="outer").sort("a") + joined = df_left.join(df_right, left_on="a", right_on="a", how="full").sort("a") assert joined["c_right"].null_count() == 1 assert joined["c"].null_count() == 1 assert joined["b"].null_count() == 1 @@ -221,7 +221,7 @@ def test_joins_dispatch() -> None: [pl.col("date").str.strptime(pl.Date), pl.col("datetime").cast(pl.Datetime)] ) - join_strategies: list[JoinStrategy] = ["left", "inner", "outer"] + join_strategies: list[JoinStrategy] = ["left", "inner", "full"] for how in join_strategies: dfa.join(dfa, on=["a", "b", "date", "datetime"], how=how) dfa.join(dfa, on=["date", "datetime"], how=how) @@ -572,9 +572,9 @@ def test_update() -> None: a.update(b, how="inner", left_on="a", right_on="c").collect().to_series() ) print(a, b) - print(a.update(b.rename({"b": "a"}), how="outer", on="a").collect()) + print(a.update(b.rename({"b": "a"}), how="full", on="a").collect()) assert [1, 2, 3, 4, 5] == sorted( - a.update(b.rename({"b": "a"}), how="outer", on="a").collect().to_series() + a.update(b.rename({"b": "a"}), how="full", on="a").collect().to_series() ) # check behavior of include_nulls=True @@ -590,7 +590,7 @@ def test_update() -> None: "C": [5, 3, 1], } ) - out = df.update(new_df, left_on="A", right_on="C", how="outer", include_nulls=True) + out = df.update(new_df, left_on="A", right_on="C", how="full", include_nulls=True) expected = pl.DataFrame( { "A": [1, 2, 3, 4, 5], @@ -602,13 +602,13 @@ def test_update() -> None: # edge-case #11684 x = pl.DataFrame({"a": [0, 1]}) y = pl.DataFrame({"a": [2, 3]}) - assert [0, 1, 2, 3] == sorted(x.update(y, on="a", how="outer")["a"].to_list()) + assert [0, 1, 2, 3] == sorted(x.update(y, on="a", how="full")["a"].to_list()) # disallowed join strategies for join_strategy in ("cross", "anti", "semi"): with pytest.raises( ValueError, - match=f"`how` must be one of {{'left', 'inner', 'outer'}}; found '{join_strategy}'", + match=f"`how` must be one of {{'left', 'inner', 'full'}}; found '{join_strategy}'", ): a.update(b, how=join_strategy) # type: ignore[arg-type] @@ -652,19 +652,19 @@ def test_join_sorted_fast_paths_null() -> None: } assert df1.join(df2, on="x", how="anti").to_dict(as_series=False) == {"x": [1]} assert df1.join(df2, on="x", how="semi").to_dict(as_series=False) == {"x": [0, 0]} - assert df1.join(df2, on="x", how="outer").to_dict(as_series=False) == { + assert df1.join(df2, on="x", how="full").to_dict(as_series=False) == { "x": [0, 0, 1, None], "x_right": [0, 0, None, None], "y": [0, 0, None, 1], } -def test_outer_join_list_() -> None: +def test_full_outer_join_list_() -> None: schema = {"id": pl.Int64, "vals": pl.List(pl.Float64)} df1 = pl.DataFrame({"id": [1], "vals": [[]]}, schema=schema) # type: ignore[arg-type] df2 = pl.DataFrame({"id": [2, 3], "vals": [[], [4]]}, schema=schema) # type: ignore[arg-type] - assert df1.join(df2, on="id", how="outer").to_dict(as_series=False) == { + assert df1.join(df2, on="id", how="full").to_dict(as_series=False) == { "id": [None, None, 1], "vals": [None, None, []], "id_right": [2, 3, None], @@ -743,7 +743,7 @@ def test_each_join_validation( } ) - join_strategies: list[JoinStrategy] = ["inner", "outer", "left"] + join_strategies: list[JoinStrategy] = ["inner", "full", "left"] for join_col in ["id", "id_str"]: for how in join_strategies: @@ -772,7 +772,7 @@ def test_join_validation_many_keys() -> None: "val2": [1, 
2, 3, 4], } ) - for join_type in ["inner", "left", "outer"]: + for join_type in ["inner", "left", "full"]: for val in ["m:m", "m:1", "1:1", "1:m"]: df1.join(df2, on=["val1", "val2"], how=join_type, validate=val) @@ -784,7 +784,7 @@ def test_join_validation_many_keys() -> None: } ) - for join_type in ["inner", "left", "outer"]: + for join_type in ["inner", "left", "full"]: for val in ["1:1", "1:m"]: with pytest.raises(pl.ComputeError): df1.join(df2, on=["val1", "val2"], how=join_type, validate=val) @@ -803,16 +803,16 @@ def test_join_validation_many_keys() -> None: } ) - for join_type in ["inner", "left", "outer"]: + for join_type in ["inner", "left", "full"]: for val in ["m:1", "1:1"]: with pytest.raises(pl.ComputeError): df1.join(df2, on=["val1", "val2"], how=join_type, validate=val) -def test_outer_join_bool() -> None: +def test_full_outer_join_bool() -> None: df1 = pl.DataFrame({"id": [True, False], "val": [1, 2]}) df2 = pl.DataFrame({"id": [True, False], "val": [0, -1]}) - assert df1.join(df2, on="id", how="outer").to_dict(as_series=False) == { + assert df1.join(df2, on="id", how="full").to_dict(as_series=False) == { "id": [True, False], "val": [1, 2], "id_right": [True, False], @@ -820,7 +820,7 @@ def test_outer_join_bool() -> None: } -def test_outer_join_coalesce_different_names_13450() -> None: +def test_full_outer_join_coalesce_different_names_13450() -> None: df1 = pl.DataFrame({"L1": ["a", "b", "c"], "L3": ["b", "c", "d"], "L2": [1, 2, 3]}) df2 = pl.DataFrame({"L3": ["a", "c", "d"], "R2": [7, 8, 9]}) @@ -994,7 +994,7 @@ def test_join_coalesce(how: str) -> None: assert out.columns == ["a", "b", "c"] -@pytest.mark.parametrize("how", ["left", "inner", "outer"]) +@pytest.mark.parametrize("how", ["left", "inner", "full", "outer"]) @typing.no_type_check def test_join_empties(how: str) -> None: df1 = pl.DataFrame({"col1": [], "col2": [], "col3": []}) diff --git a/py-polars/tests/unit/streaming/test_streaming_join.py b/py-polars/tests/unit/streaming/test_streaming_join.py index 0cd75fccfb1b7..cc783fa69ed30 100644 --- a/py-polars/tests/unit/streaming/test_streaming_join.py +++ b/py-polars/tests/unit/streaming/test_streaming_join.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -9,10 +9,13 @@ import polars as pl from polars.testing import assert_frame_equal +if TYPE_CHECKING: + from polars.type_aliases import JoinStrategy + pytestmark = pytest.mark.xdist_group("streaming") -def test_streaming_outer_joins() -> None: +def test_streaming_full_outer_joins() -> None: n = 100 dfa = pl.DataFrame( { @@ -29,12 +32,16 @@ def test_streaming_outer_joins() -> None: } ) - join_strategies: list[Literal["outer", "outer_coalesce"]] = [ - "outer", - "outer_coalesce", + join_strategies: list[tuple[JoinStrategy, bool]] = [ + ("full", False), + ("full", True), ] - for how in join_strategies: - q = dfa.lazy().join(dfb.lazy(), on="a", how=how).sort(["idx"]) + for how, coalesce in join_strategies: + q = ( + dfa.lazy() + .join(dfb.lazy(), on="a", how=how, coalesce=coalesce) + .sort(["idx"]) + ) a = q.collect(streaming=True) b = q.collect(streaming=False) assert_frame_equal(a, b) @@ -172,14 +179,14 @@ def test_join_null_matches(streaming: bool) -> None: df_a.join(df_b, on="a", how="inner").collect(streaming=streaming), expected ) - # Left + # Left outer expected = pl.DataFrame( {"idx_a": [0, 1, 2], "a": [None, 1, 2], "idx_b": [None, 2, 1]} ) assert_frame_equal( df_a.join(df_b, on="a", 
how="left").collect(streaming=streaming), expected ) - # Outer + # Full outer expected = pl.DataFrame( { "idx_a": [None, 2, 1, None, 0], @@ -188,7 +195,7 @@ def test_join_null_matches(streaming: bool) -> None: "a_right": [None, 2, 1, None, None], } ) - assert_frame_equal(df_a.join(df_b, on="a", how="outer").collect(), expected) + assert_frame_equal(df_a.join(df_b, on="a", how="full").collect(), expected) @pytest.mark.parametrize("streaming", [False, True]) @@ -231,7 +238,7 @@ def test_join_null_matches_multiple_keys(streaming: bool) -> None: } ) assert_frame_equal( - df_a.join(df_b, on=["a", "idx"], how="outer").sort("a").collect(), expected + df_a.join(df_b, on=["a", "idx"], how="full").sort("a").collect(), expected ) diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index e88914b0c1389..976e05fb3e51d 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -129,7 +129,7 @@ def test_unnest_columns_available() -> None: q = df.with_columns( pl.col("genres") .str.split("|") - .list.to_struct(n_field_strategy="max_width", fields=lambda i: f"genre{i+1}") + .list.to_struct(n_field_strategy="max_width", fields=lambda i: f"genre{i + 1}") ).unnest("genres") out = q.collect() @@ -360,13 +360,13 @@ def test_projection_count_11841() -> None: ).collect() -def test_schema_outer_join_projection_pd_13287() -> None: +def test_schema_full_outer_join_projection_pd_13287() -> None: lf = pl.LazyFrame({"a": [1, 1], "b": [2, 3]}) lf2 = pl.LazyFrame({"a": [1, 1], "c": [2, 3]}) assert lf.join( lf2, - how="outer", + how="full", left_on="a", right_on="c", ).with_columns( @@ -374,11 +374,11 @@ def test_schema_outer_join_projection_pd_13287() -> None: ).select("a").collect().to_dict(as_series=False) == {"a": [2, 3, 1, 1]} -def test_projection_pushdown_outer_join_duplicates() -> None: +def test_projection_pushdown_full_outer_join_duplicates() -> None: df1 = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]}).lazy() df2 = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]}).lazy() assert ( - df1.join(df2, on="a", how="outer").with_columns(c=0).select("a", "c").collect() + df1.join(df2, on="a", how="full").with_columns(c=0).select("a", "c").collect() ).to_dict(as_series=False) == {"a": [1, 2, 3], "c": [0, 0, 0]}