Skip to content

Commit

Permalink
feat: Add stringview support to encode and decode and `bit_length` (#13332)
Browse files Browse the repository at this point in the history

* add stringview

* add tests

* remove utf8view

* remove array_to_string changes

* remove use
  • Loading branch information
jonathanc-n authored Nov 17, 2024
1 parent 73507c3 commit 61fa572
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 8 deletions.
4 changes: 2 additions & 2 deletions datafusion/functions/src/core/named_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
use std::sync::{Arc, OnceLock};

/// put values in a struct array.
/// Put values in a struct array.
fn named_struct_expr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
// do not accept 0 arguments.
// Do not accept 0 arguments.
if args.is_empty() {
return exec_err!(
"named_struct requires at least one pair of arguments, got 0 instead"
Expand Down
20 changes: 14 additions & 6 deletions datafusion/functions/src/encoding/inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ impl ScalarUDFImpl for EncodeFunc {
}

match arg_types[0] {
DataType::Utf8 | DataType::Binary | DataType::Null => {
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
Ok(vec![DataType::Utf8; 2])
}
DataType::LargeUtf8 | DataType::LargeBinary => {
Expand Down Expand Up @@ -195,7 +195,7 @@ impl ScalarUDFImpl for DecodeFunc {
}

match arg_types[0] {
DataType::Utf8 | DataType::Binary | DataType::Null => {
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
Ok(vec![DataType::Binary, DataType::Utf8])
}
DataType::LargeUtf8 | DataType::LargeBinary => {
Expand Down Expand Up @@ -224,6 +224,7 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
ColumnarValue::Array(a) => match a.data_type() {
DataType::Utf8 => encoding.encode_utf8_array::<i32>(a.as_ref()),
DataType::LargeUtf8 => encoding.encode_utf8_array::<i64>(a.as_ref()),
DataType::Utf8View => encoding.encode_utf8_array::<i32>(a.as_ref()),
DataType::Binary => encoding.encode_binary_array::<i32>(a.as_ref()),
DataType::LargeBinary => encoding.encode_binary_array::<i64>(a.as_ref()),
other => exec_err!(
Expand All @@ -237,6 +238,9 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
}
ScalarValue::LargeUtf8(a) => Ok(encoding
.encode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes()))),
ScalarValue::Utf8View(a) => {
Ok(encoding.encode_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
}
ScalarValue::Binary(a) => Ok(
encoding.encode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
),
Expand All @@ -255,6 +259,7 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
ColumnarValue::Array(a) => match a.data_type() {
DataType::Utf8 => encoding.decode_utf8_array::<i32>(a.as_ref()),
DataType::LargeUtf8 => encoding.decode_utf8_array::<i64>(a.as_ref()),
DataType::Utf8View => encoding.decode_utf8_array::<i32>(a.as_ref()),
DataType::Binary => encoding.decode_binary_array::<i32>(a.as_ref()),
DataType::LargeBinary => encoding.decode_binary_array::<i64>(a.as_ref()),
other => exec_err!(
Expand All @@ -268,6 +273,9 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
}
ScalarValue::LargeUtf8(a) => encoding
.decode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes())),
ScalarValue::Utf8View(a) => {
encoding.decode_scalar(a.as_ref().map(|s: &String| s.as_bytes()))
}
ScalarValue::Binary(a) => {
encoding.decode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
}
Expand Down Expand Up @@ -512,7 +520,7 @@ impl FromStr for Encoding {
}
}

/// Encodes the given data, accepts Binary, LargeBinary, Utf8 or LargeUtf8 and returns a [`ColumnarValue`].
/// Encodes the given data, accepts Binary, LargeBinary, Utf8, Utf8View or LargeUtf8 and returns a [`ColumnarValue`].
/// Second argument is the encoding to use.
/// Standard encodings are base64 and hex.
fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
Expand All @@ -524,7 +532,7 @@ fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
}
let encoding = match &args[1] {
ColumnarValue::Scalar(scalar) => match scalar {
ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
ScalarValue::Utf8(Some(method)) | ScalarValue::Utf8View(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
method.parse::<Encoding>()
}
_ => not_impl_err!(
Expand All @@ -538,7 +546,7 @@ fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
encode_process(&args[0], encoding)
}

/// Decodes the given data, accepts Binary, LargeBinary, Utf8 or LargeUtf8 and returns a [`ColumnarValue`].
/// Decodes the given data, accepts Binary, LargeBinary, Utf8, Utf8View or LargeUtf8 and returns a [`ColumnarValue`].
/// Second argument is the encoding to use.
/// Standard encodings are base64 and hex.
fn decode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
Expand All @@ -550,7 +558,7 @@ fn decode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
}
let encoding = match &args[1] {
ColumnarValue::Scalar(scalar) => match scalar {
ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
ScalarValue::Utf8(Some(method)) | ScalarValue::Utf8View(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
method.parse::<Encoding>()
}
_ => not_impl_err!(
Expand Down
31 changes: 31 additions & 0 deletions datafusion/sqllogictest/test_files/encoding.slt
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,34 @@ select to_hex(num) from test ORDER BY num;
0
1
2

# Test Utf8View support for encode: base64 and hex output on Utf8View columns, including a NULL input row
statement ok
CREATE TABLE test_source AS VALUES
('Andrew', 'X'),
('Xiangpeng', 'Xiangpeng'),
('Raphael', 'R'),
(NULL, 'R');

statement ok
CREATE TABLE test_utf8view AS
select
arrow_cast(column1, 'Utf8View') AS column1_utf8view,
arrow_cast(column2, 'Utf8View') AS column2_utf8view
FROM test_source;

query TTTTTT
SELECT
column1_utf8view,
encode(column1_utf8view, 'base64') AS column1_base64,
encode(column1_utf8view, 'hex') AS column1_hex,

column2_utf8view,
encode(column2_utf8view, 'base64') AS column2_base64,
encode(column2_utf8view, 'hex') AS column2_hex
FROM test_utf8view;
----
Andrew QW5kcmV3 416e64726577 X WA 58
Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67
Raphael UmFwaGFlbA 5261706861656c R Ug 52
NULL NULL NULL R Ug 52
5 changes: 5 additions & 0 deletions datafusion/sqllogictest/test_files/expr.slt
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,11 @@ SELECT bit_length(NULL)
----
NULL

query I
SELECT bit_length(arrow_cast('jonathan', 'Utf8View'));
----
64

query T
SELECT btrim(' xyxtrimyyx ', NULL)
----
Expand Down

0 comments on commit 61fa572

Please sign in to comment.