From 91fae8952b86b54fdf7c40b9c46e5193c81a861f Mon Sep 17 00:00:00 2001 From: Jiashen Cao Date: Thu, 14 Nov 2024 10:24:37 -0800 Subject: [PATCH 1/5] update --- datafusion/functions/src/unicode/left.rs | 11 +++-------- datafusion/functions/src/unicode/lpad.rs | 22 ++++++++-------------- datafusion/functions/src/unicode/right.rs | 11 +++-------- datafusion/functions/src/unicode/rpad.rs | 22 ++++++++-------------- 4 files changed, 22 insertions(+), 44 deletions(-) diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index a6c2b9768f0b..442093154618 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -30,9 +30,9 @@ use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::exec_err; +use datafusion_common::types::{logical_int64, logical_string}; use datafusion_common::Result; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -50,14 +50,9 @@ impl Default for LeftFunc { impl LeftFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), - ], + signature: Signature::coercible( + vec![logical_string(), logical_int64()], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index a639bcedcd1f..46b56c36321c 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -24,6 +24,7 @@ use arrow::array::{ OffsetSizeTrait, StringViewArray, }; use arrow::datatypes::DataType; +use datafusion_common::types::{logical_int64, logical_string}; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; @@ -32,7 +33,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::TypeSignature; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -50,22 +51,15 @@ impl Default for LPadFunc { impl LPadFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8View, Int64, Utf8View]), - Exact(vec![Utf8View, Int64, Utf8]), - Exact(vec![Utf8View, Int64, LargeUtf8]), - Exact(vec![Utf8, Int64]), - Exact(vec![Utf8, Int64, Utf8View]), - Exact(vec![Utf8, Int64, Utf8]), - Exact(vec![Utf8, Int64, LargeUtf8]), - Exact(vec![LargeUtf8, Int64]), - Exact(vec![LargeUtf8, Int64, Utf8View]), - Exact(vec![LargeUtf8, Int64, Utf8]), - Exact(vec![LargeUtf8, Int64, LargeUtf8]), + TypeSignature::Coercible(vec![logical_string(), logical_int64()]), + TypeSignature::Coercible(vec![ + logical_string(), + logical_int64(), + logical_string(), + ]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index ab3b7ba1a27e..9295e6d5780b 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -30,9 +30,9 @@ use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::exec_err; +use datafusion_common::types::{logical_int64, logical_string}; use datafusion_common::Result; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -50,14 +50,9 @@ impl Default for RightFunc { impl RightFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), - ], + signature: Signature::coercible( + vec![logical_string(), logical_int64()], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index bd9d625105e9..f9f640f9aa51 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -23,10 +23,11 @@ use arrow::array::{ }; use arrow::datatypes::DataType; use datafusion_common::cast::as_int64_array; +use datafusion_common::types::{logical_int64, logical_string}; use datafusion_common::DataFusionError; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::TypeSignature; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -49,22 +50,15 @@ impl Default for RPadFunc { impl RPadFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8View, Int64, Utf8View]), - Exact(vec![Utf8View, Int64, Utf8]), - Exact(vec![Utf8View, Int64, LargeUtf8]), - Exact(vec![Utf8, Int64]), - Exact(vec![Utf8, Int64, Utf8View]), - Exact(vec![Utf8, Int64, Utf8]), - Exact(vec![Utf8, Int64, LargeUtf8]), - Exact(vec![LargeUtf8, Int64]), - Exact(vec![LargeUtf8, Int64, Utf8View]), - Exact(vec![LargeUtf8, Int64, Utf8]), - Exact(vec![LargeUtf8, Int64, LargeUtf8]), + TypeSignature::Coercible(vec![logical_string(), logical_int64()]), + TypeSignature::Coercible(vec![ + logical_string(), + logical_int64(), + logical_string(), + ]), ], Volatility::Immutable, ), From 89988d81c418e807165eed4a0b353ae238732883 Mon Sep 17 00:00:00 2001 From: Jiashen Cao Date: Thu, 14 Nov 2024 18:11:02 -0800 Subject: [PATCH 2/5] update --- datafusion/sqllogictest/test_files/scalar.slt | 4 ++-- datafusion/sqllogictest/test_files/string/string_view.slt | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index fe7d1a90c5bd..3370bd854c7d 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1864,10 +1864,10 @@ query TT EXPLAIN SELECT letter, letter = LEFT(letter2, 1) FROM simple_string; ---- logical_plan -01)Projection: simple_string.letter, simple_string.letter = left(simple_string.letter2, Int64(1)) +01)Projection: simple_string.letter, simple_string.letter = left(CAST(simple_string.letter2 AS Utf8View), Int64(1)) 02)--TableScan: simple_string projection=[letter, letter2] physical_plan -01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(letter2@1, 1) as simple_string.letter = left(simple_string.letter2,Int64(1))] +01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(CAST(letter2@1 AS Utf8View), 1) as simple_string.letter = left(simple_string.letter2,Int64(1))] 02)--MemoryExec: partitions=1, partition_sizes=[1] query TB diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 2f4af80a9257..75be03278df0 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -663,7 +663,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: lpad(test.column1_utf8view, Int64(12), Utf8(" ")) AS c1 +01)Projection: lpad(test.column1_utf8view, Int64(12), Utf8View(" ")) AS c1 02)--TableScan: test projection=[column1_utf8view] query TT @@ -672,7 +672,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: lpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1 +01)Projection: lpad(test.column1_utf8view, Int64(12), CAST(test.column2_large_utf8 AS Utf8View)) AS c1 02)--TableScan: test projection=[column2_large_utf8, column1_utf8view] query TT @@ -826,7 +826,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: rpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1 +01)Projection: rpad(test.column1_utf8view, Int64(12), CAST(test.column2_large_utf8 AS Utf8View)) AS c1 02)--TableScan: test projection=[column2_large_utf8, column1_utf8view] query TT From 6723ad039575085d544f097318ac8dc7dbc0408b Mon Sep 17 00:00:00 2001 From: Jiashen Cao Date: Fri, 15 Nov 2024 11:52:36 -0800 Subject: [PATCH 3/5] no casting if the logical types are the same --- datafusion/expr/src/type_coercion/functions.rs | 16 ++++++++++++++-- datafusion/sqllogictest/test_files/jctest.slt | 4 ++++ datafusion/sqllogictest/test_files/scalar.slt | 4 ++-- .../test_files/string/string_view.slt | 6 +++--- 4 files changed, 23 insertions(+), 7 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/jctest.slt diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 6836713d8016..bf3bd07949ce 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -44,7 +44,9 @@ pub fn data_types_with_scalar_udf( current_types: &[DataType], func: &ScalarUDF, ) -> Result> { + println!("func - {:?}", func); let signature = func.signature(); + println!("sig - {:?}", signature); if current_types.is_empty() { if signature.type_signature.supports_zero_argument() { @@ -56,6 +58,7 @@ pub fn data_types_with_scalar_udf( let valid_types = get_valid_types_with_scalar_udf(&signature.type_signature, current_types, func)?; + println!("validT - {:?}", valid_types); if valid_types .iter() @@ -544,11 +547,20 @@ fn get_valid_types( for (current_type, target_type) in current_types.iter().zip(target_types.iter()) { + println!("curT - {:?}", current_type); + println!("targetT - {:?}", target_type); let logical_type: NativeType = current_type.into(); let target_logical_type = target_type.native(); + println!("cur_logiT - {:?}", logical_type); + println!("target_logiT - {:?}", target_logical_type); if can_coerce_to(&logical_type, target_logical_type) { - let target_type = - target_logical_type.default_cast_for(current_type)?; + let target_type = if &logical_type == target_logical_type { + current_type.clone() + } else { + target_logical_type.default_cast_for(current_type)? + }; + // let target_type = target_logical_type.default_cast_for(current_type)?; + println!("new targetT - {:?}", target_type); new_types.push(target_type); } } diff --git a/datafusion/sqllogictest/test_files/jctest.slt b/datafusion/sqllogictest/test_files/jctest.slt new file mode 100644 index 000000000000..a3d6fb40ff1f --- /dev/null +++ b/datafusion/sqllogictest/test_files/jctest.slt @@ -0,0 +1,4 @@ +query T +SELECT left(arrow_cast('abcde', 'Dictionary(Int32, Utf8)'), -2) +---- +abc \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 3370bd854c7d..fe7d1a90c5bd 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1864,10 +1864,10 @@ query TT EXPLAIN SELECT letter, letter = LEFT(letter2, 1) FROM simple_string; ---- logical_plan -01)Projection: simple_string.letter, simple_string.letter = left(CAST(simple_string.letter2 AS Utf8View), Int64(1)) +01)Projection: simple_string.letter, simple_string.letter = left(simple_string.letter2, Int64(1)) 02)--TableScan: simple_string projection=[letter, letter2] physical_plan -01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(CAST(letter2@1 AS Utf8View), 1) as simple_string.letter = left(simple_string.letter2,Int64(1))] +01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(letter2@1, 1) as simple_string.letter = left(simple_string.letter2,Int64(1))] 02)--MemoryExec: partitions=1, partition_sizes=[1] query TB diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 75be03278df0..2f4af80a9257 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -663,7 +663,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: lpad(test.column1_utf8view, Int64(12), Utf8View(" ")) AS c1 +01)Projection: lpad(test.column1_utf8view, Int64(12), Utf8(" ")) AS c1 02)--TableScan: test projection=[column1_utf8view] query TT @@ -672,7 +672,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: lpad(test.column1_utf8view, Int64(12), CAST(test.column2_large_utf8 AS Utf8View)) AS c1 +01)Projection: lpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1 02)--TableScan: test projection=[column2_large_utf8, column1_utf8view] query TT @@ -826,7 +826,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: rpad(test.column1_utf8view, Int64(12), CAST(test.column2_large_utf8 AS Utf8View)) AS c1 +01)Projection: rpad(test.column1_utf8view, Int64(12), test.column2_large_utf8) AS c1 02)--TableScan: test projection=[column2_large_utf8, column1_utf8view] query TT From 30f4ac9dbe586c94ba97907e473474b9b20335df Mon Sep 17 00:00:00 2001 From: Jiashen Cao Date: Sun, 17 Nov 2024 11:04:46 -0800 Subject: [PATCH 4/5] support dict type in left --- datafusion/functions/src/unicode/left.rs | 32 ++++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index 442093154618..5ebd90cd6ae8 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -17,20 +17,21 @@ use std::any::Any; use std::cmp::Ordering; +use std::string; use std::sync::{Arc, OnceLock}; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, - OffsetSizeTrait, + as_dictionary_array, Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, + Int64Array, OffsetSizeTrait, }; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Int32Type}; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::exec_err; -use datafusion_common::types::{logical_int64, logical_string}; +use datafusion_common::types::{logical_int64, logical_string, TypeSignature}; use datafusion_common::Result; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::{ @@ -82,6 +83,10 @@ impl ScalarUDFImpl for LeftFunc { make_scalar_function(left::, vec![])(args) } DataType::LargeUtf8 => make_scalar_function(left::, vec![])(args), + DataType::Dictionary(_, v) => match &*v { + DataType::Utf8 => make_scalar_function(left::, vec![])(args), + _ => exec_err!("Unsupported data type {v:?}"), + }, other => exec_err!( "Unsupported data type {other:?} for function left,\ expected Utf8View, Utf8 or LargeUtf8." @@ -127,9 +132,26 @@ pub fn left(args: &[ArrayRef]) -> Result { if args[0].data_type() == &DataType::Utf8View { let string_array = as_string_view_array(&args[0])?; left_impl::(string_array, n_array) - } else { + } else if args[0].data_type() == &DataType::Utf8 + || args[0].data_type() == &DataType::LargeUtf8 + { let string_array = as_generic_string_array::(&args[0])?; left_impl::(string_array, n_array) + } else { + let dict_array = match &args[0].data_type() { + DataType::Dictionary(k, _) => match **k { + DataType::Int32 => Ok(as_dictionary_array::(&args[0])), + _ => exec_err!("Unsupported Dictionary key type {k:?}"), + }, + _ => exec_err!("Unsupported type {:?}", &args[0].data_type()), + } + .unwrap(); + let string_array = dict_array + .values() + .as_any() + .downcast_ref::>() + .unwrap(); + left_impl::(string_array, n_array) } } From c66cee64e43483c94eaefc28ed8f41daeae4a23b Mon Sep 17 00:00:00 2001 From: Jiashen Cao Date: Sun, 17 Nov 2024 11:05:19 -0800 Subject: [PATCH 5/5] remove print --- datafusion/expr/src/type_coercion/functions.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index bf3bd07949ce..be83e5bb7f41 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -44,9 +44,7 @@ pub fn data_types_with_scalar_udf( current_types: &[DataType], func: &ScalarUDF, ) -> Result> { - println!("func - {:?}", func); let signature = func.signature(); - println!("sig - {:?}", signature); if current_types.is_empty() { if signature.type_signature.supports_zero_argument() { @@ -58,7 +56,6 @@ pub fn data_types_with_scalar_udf( let valid_types = get_valid_types_with_scalar_udf(&signature.type_signature, current_types, func)?; - println!("validT - {:?}", valid_types); if valid_types .iter() @@ -547,12 +544,8 @@ fn get_valid_types( for (current_type, target_type) in current_types.iter().zip(target_types.iter()) { - println!("curT - {:?}", current_type); - println!("targetT - {:?}", target_type); let logical_type: NativeType = current_type.into(); let target_logical_type = target_type.native(); - println!("cur_logiT - {:?}", logical_type); - println!("target_logiT - {:?}", target_logical_type); if can_coerce_to(&logical_type, target_logical_type) { let target_type = if &logical_type == target_logical_type { current_type.clone() @@ -560,7 +553,6 @@ fn get_valid_types( target_logical_type.default_cast_for(current_type)? }; // let target_type = target_logical_type.default_cast_for(current_type)?; - println!("new targetT - {:?}", target_type); new_types.push(target_type); } }