chore: update datafusion and related crates (#1504)
# Description

Updating datafusion and related crates to their latest versions. With the updated object store we unfortunately lose support for `aws-profile`. Since object store now also contains logic for parsing URLs, which we currently maintain here, I plan to adopt these new APIs and recover profile support in a follow-up PR. That will also remove the deprecations ignored in this PR.
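
For context, the URL parsing now shipped with object_store resolves a location string to a concrete store implementation plus the path within it. A minimal sketch, assuming object_store 0.6's `parse_url` entry point (not code from this PR):

```rust
use object_store::{parse_url, path::Path, ObjectStore};
use url::Url;

// Resolve a URL such as "s3://bucket/prefix" to a store and an in-store path.
fn store_for(url_str: &str) -> object_store::Result<(Box<dyn ObjectStore>, Path)> {
    let url = Url::parse(url_str).expect("valid URL");
    parse_url(&url)
}
```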
roeap authored Jul 5, 2023
1 parent 6650bd2 commit 56dfd25
Showing 19 changed files with 123 additions and 100 deletions.
5 changes: 3 additions & 2 deletions docker-compose.yml
@@ -18,8 +18,9 @@ services:
test: [ "CMD", "curl", "-f", "http://localhost:4566/health" ]

fake-gcs:
-  image: fsouza/fake-gcs-server
-  command: ["-scheme", "http", "-port", "4443", "-external-url", "http://[::]:4443", "-backend", "memory"]
+  # Custom image - see fsouza/fake-gcs-server#1164
+  image: tustvold/fake-gcs-server
+  command: ["-scheme", "http", "-public-host", "localhost:4443", "-backend", "memory"]
ports:
- 4443:4443

4 changes: 2 additions & 2 deletions python/Cargo.toml
@@ -18,7 +18,7 @@ doc = false
name = "deltalake._internal"

[dependencies]
arrow-schema = { version = "40", features = ["serde"] }
arrow-schema = { version = "42", features = ["serde"] }
chrono = "0"
env_logger = "0"
futures = "0.3"
@@ -35,7 +35,7 @@ num_cpus = "1"
reqwest = { version = "*", features = ["native-tls-vendored"] }

[dependencies.pyo3]
version = "0.18"
version = "0.19"
features = ["extension-module", "abi3", "abi3-py37"]

[dependencies.deltalake]
3 changes: 2 additions & 1 deletion python/src/lib.rs
@@ -775,7 +775,7 @@ fn write_new_deltalake(
Ok(())
}

-#[pyclass(name = "DeltaDataChecker", text_signature = "(invariants)")]
+#[pyclass(name = "DeltaDataChecker")]
struct PyDeltaDataChecker {
inner: DeltaDataChecker,
rt: tokio::runtime::Runtime,
@@ -784,6 +784,7 @@ struct PyDeltaDataChecker {
#[pymethods]
impl PyDeltaDataChecker {
#[new]
+#[pyo3(signature = (invariants))]
fn new(invariants: Vec<(String, String)>) -> Self {
let invariants: Vec<Invariant> = invariants
.into_iter()
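The same migration recurs throughout the bindings: pyo3 0.19 deprecates `text_signature` on `#[pyclass]` and instead derives the signature from a `#[pyo3(signature = ...)]` attribute on the constructor. A minimal sketch of the pattern on a hypothetical class (not code from this PR):

```rust
use pyo3::prelude::*;

#[pyclass(name = "Example")]
struct Example {
    values: Vec<String>,
}

#[pymethods]
impl Example {
    #[new]
    #[pyo3(signature = (values))] // pyo3 generates the text signature from this
    fn new(values: Vec<String>) -> Self {
        Self { values }
    }
}
```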
24 changes: 8 additions & 16 deletions python/src/schema.rs
@@ -113,7 +113,7 @@ fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult<SchemaDataType> {
/// * "decimal(<precision>, <scale>)"
///
/// :param data_type: string representation of the data type
-#[pyclass(module = "deltalake.schema", text_signature = "(data_type)")]
+#[pyclass(module = "deltalake.schema")]
#[derive(Clone)]
pub struct PrimitiveType {
inner_type: String,
@@ -132,6 +132,7 @@ impl TryFrom<SchemaDataType> for PrimitiveType {
#[pymethods]
impl PrimitiveType {
#[new]
+#[pyo3(signature = (data_type))]
fn new(data_type: String) -> PyResult<Self> {
if data_type.starts_with("decimal") {
if try_parse_decimal_type(&data_type).is_none() {
@@ -246,10 +247,7 @@ impl PrimitiveType {
/// ArrayType(PrimitiveType("integer"), contains_null=True)
/// >>> ArrayType("integer", contains_null=False)
/// ArrayType(PrimitiveType("integer"), contains_null=False)
-#[pyclass(
-    module = "deltalake.schema",
-    text_signature = "(element_type, contains_null=True)"
-)]
+#[pyclass(module = "deltalake.schema")]
#[derive(Clone)]
pub struct ArrayType {
inner_type: SchemaTypeArray,
@@ -411,10 +409,7 @@ impl ArrayType {
/// MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True)
/// >>> MapType("integer", "string", value_contains_null=False)
/// MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=False)
-#[pyclass(
-    module = "deltalake.schema",
-    text_signature = "(key_type, value_type, value_contains_null=True)"
-)]
+#[pyclass(module = "deltalake.schema")]
#[derive(Clone)]
pub struct MapType {
inner_type: SchemaTypeMap,
@@ -597,10 +592,7 @@ impl MapType {
///
/// >>> Field("my_col", "integer", metadata={"custom_metadata": {"test": 2}})
/// Field("my_col", PrimitiveType("integer"), nullable=True, metadata={"custom_metadata": {"test": 2}})
-#[pyclass(
-    module = "deltalake.schema",
-    text_signature = "(name, type, nullable=True, metadata=None)"
-)]
+#[pyclass(module = "deltalake.schema")]
#[derive(Clone)]
pub struct Field {
inner: SchemaField,
@@ -778,7 +770,7 @@ impl Field {
///
/// >>> StructType([Field("x", "integer"), Field("y", "string")])
/// StructType([Field(x, PrimitiveType("integer"), nullable=True), Field(y, PrimitiveType("string"), nullable=True)])
-#[pyclass(subclass, module = "deltalake.schema", text_signature = "(fields)")]
+#[pyclass(subclass, module = "deltalake.schema")]
#[derive(Clone)]
pub struct StructType {
inner_type: SchemaTypeStruct,
@@ -951,13 +943,13 @@ pub fn schema_to_pyobject(schema: &Schema, py: Python) -> PyResult<PyObject> {
/// >>> import pyarrow as pa
/// >>> Schema.from_pyarrow(pa.schema({"x": pa.int32(), "y": pa.string()}))
/// Schema([Field(x, PrimitiveType("integer"), nullable=True), Field(y, PrimitiveType("string"), nullable=True)])
-#[pyclass(extends = StructType, name = "Schema", module = "deltalake.schema",
-    text_signature = "(fields)")]
+#[pyclass(extends = StructType, name = "Schema", module = "deltalake.schema")]
pub struct PySchema;

#[pymethods]
impl PySchema {
#[new]
+#[pyo3(signature = (fields))]
fn new(fields: Vec<PyRef<Field>>) -> PyResult<(Self, StructType)> {
let fields: Vec<SchemaField> = fields
.into_iter()
40 changes: 19 additions & 21 deletions rust/Cargo.toml
@@ -13,14 +13,14 @@ readme = "README.md"
edition = "2021"

[dependencies]
-arrow = { version = "40", optional = true }
-arrow-array = { version = "40", optional = true }
-arrow-buffer = { version = "40", optional = true }
-arrow-cast = { version = "40", optional = true }
-arrow-ord = { version = "40", optional = true }
-arrow-row = { version = "40", optional = true }
-arrow-schema = { version = "40", optional = true }
-arrow-select = { version = "40", optional = true }
+arrow = { version = "42", optional = true }
+arrow-array = { version = "42", optional = true }
+arrow-buffer = { version = "42", optional = true }
+arrow-cast = { version = "42", optional = true }
+arrow-ord = { version = "42", optional = true }
+arrow-row = { version = "42", optional = true }
+arrow-schema = { version = "42", optional = true }
+arrow-select = { version = "42", optional = true }
async-trait = "0.1"
bytes = "1"
chrono = { version = "0.4.22", default-features = false, features = ["clock"] }
@@ -38,10 +38,10 @@ libc = ">=0.2.90, <1"
num-bigint = "0.4"
num_cpus = "1"
num-traits = "0.2.15"
object_store = "0.5.6"
object_store = "0.6.1"
once_cell = "1.16.0"
parking_lot = "0.12"
parquet = { version = "40", features = [
parquet = { version = "42", features = [
"async",
"object_store",
], optional = true }
@@ -50,7 +50,7 @@ percent-encoding = "2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "1"
tokio = { version = "1", features = ["macros", "rt", "parking_lot"] }
tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
regex = "1"
uuid = { version = "1", features = ["serde", "v4"] }
url = "2.3"
@@ -65,7 +65,7 @@ rusoto_dynamodb = { version = "0.47", default-features = false, optional = true
rusoto_glue = { version = "0.47", default-features = false, optional = true }

# Unity
reqwest = { version = "0.11", default-features = false, features = [
reqwest = { version = "0.11.18", default-features = false, features = [
"rustls-tls",
"json",
], optional = true }
@@ -74,15 +74,15 @@ reqwest-retry = { version = "0.2.2", optional = true }

# Datafusion
dashmap = { version = "5", optional = true }
datafusion = { version = "26", optional = true }
datafusion-expr = { version = "26", optional = true }
datafusion-common = { version = "26", optional = true }
datafusion-proto = { version = "26", optional = true }
datafusion-sql = { version = "26", optional = true }
datafusion-physical-expr = { version = "26", optional = true }
datafusion = { version = "27", optional = true }
datafusion-expr = { version = "27", optional = true }
datafusion-common = { version = "27", optional = true }
datafusion-proto = { version = "27", optional = true }
datafusion-sql = { version = "27", optional = true }
datafusion-physical-expr = { version = "27", optional = true }


sqlparser = { version = "0.34", optional = true }
sqlparser = { version = "0.35", optional = true }

# NOTE dependencies only for integration tests
fs_extra = { version = "1.2.0", optional = true }
@@ -135,7 +135,6 @@ s3-native-tls = [
"rusoto_dynamodb/native-tls",
"dynamodb_lock/native-tls",
"object_store/aws",
"object_store/aws_profile",
]
s3 = [
"rusoto_core/rustls",
@@ -144,7 +143,6 @@ s3 = [
"rusoto_dynamodb/rustls",
"dynamodb_lock/rustls",
"object_store/aws",
"object_store/aws_profile",
]
unity-experimental = ["reqwest", "reqwest-middleware", "reqwest-retry"]

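With the `aws_profile` feature dropped from both S3 feature sets, S3 stores are configured through explicit settings or the standard AWS environment variables until the follow-up PR restores profile support. A sketch, assuming object_store 0.6's builder API (the bucket name is hypothetical):

```rust
use object_store::aws::{AmazonS3, AmazonS3Builder};

fn s3_store() -> object_store::Result<AmazonS3> {
    // Credentials come from explicit configuration or the standard
    // AWS environment variables; profile-based lookup is unavailable.
    AmazonS3Builder::from_env()
        .with_bucket_name("my-bucket")
        .build()
}
```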
2 changes: 2 additions & 0 deletions rust/src/action/checkpoints.rs
@@ -211,6 +211,7 @@ pub async fn cleanup_expired_logs_for(
location: Path::from(""),
last_modified: DateTime::<Utc>::MIN_UTC,
size: 0,
+e_tag: None,
},
);
let file_needs_time_adjustment =
@@ -255,6 +256,7 @@ pub async fn cleanup_expired_logs_for(
location: current_file.1.location.clone(),
last_modified: last_file.1.last_modified.add(Duration::seconds(1)),
size: 0,
+e_tag: None,
},
);
maybe_delete_files.push(updated);
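The `e_tag: None` additions here (and in the test fixtures below) follow from object_store 0.6 extending `ObjectMeta` with an optional HTTP entity tag, so every struct literal must populate the new field. A minimal sketch under that assumption:

```rust
use chrono::{DateTime, Utc};
use object_store::{path::Path, ObjectMeta};

fn placeholder_meta() -> ObjectMeta {
    ObjectMeta {
        location: Path::from(""),
        last_modified: DateTime::<Utc>::MIN_UTC,
        size: 0,
        e_tag: None, // None when the store reports no ETag
    }
}
```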
2 changes: 1 addition & 1 deletion rust/src/data_catalog/storage/mod.rs
@@ -145,7 +145,7 @@ impl SchemaProvider for ListingSchemaProvider {
mod tests {
use super::*;
use datafusion::assert_batches_sorted_eq;
-use datafusion::catalog::catalog::{CatalogProvider, MemoryCatalogProvider};
+use datafusion::catalog::{CatalogProvider, MemoryCatalogProvider};
use datafusion::execution::context::SessionContext;

#[test]
10 changes: 5 additions & 5 deletions rust/src/delta_datafusion.rs
@@ -36,16 +36,16 @@ use arrow_array::StringArray;
use arrow_schema::Field;
use async_trait::async_trait;
use chrono::{DateTime, NaiveDateTime, Utc};
-use datafusion::datasource::datasource::TableProviderFactory;
use datafusion::datasource::file_format::{parquet::ParquetFormat, FileFormat};
+use datafusion::datasource::physical_plan::FileScanConfig;
+use datafusion::datasource::provider::TableProviderFactory;
use datafusion::datasource::{listing::PartitionedFile, MemTable, TableProvider, TableType};
use datafusion::execution::context::{SessionContext, SessionState, TaskContext};
use datafusion::execution::runtime_env::RuntimeEnv;
use datafusion::execution::FunctionRegistry;
use datafusion::optimizer::utils::conjunction;
use datafusion::physical_expr::PhysicalSortExpr;
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
-use datafusion::physical_plan::file_format::FileScanConfig;
use datafusion::physical_plan::filter::FilterExec;
use datafusion::physical_plan::limit::LocalLimitExec;
use datafusion::physical_plan::{
@@ -1377,7 +1377,6 @@ mod tests {
use arrow::array::StructArray;
use arrow::datatypes::{DataType, Field, Schema};
use chrono::{TimeZone, Utc};
-use datafusion::from_slice::FromSlice;
use datafusion::physical_plan::empty::EmptyExec;
use datafusion_proto::physical_plan::AsExecutionPlan;
use datafusion_proto::protobuf;
@@ -1558,6 +1557,7 @@ mod tests {
location: Path::from("year=2015/month=1/part-00000-4dcb50d3-d017-450c-9df7-a7257dbd3c5d-c000.snappy.parquet".to_string()),
last_modified: Utc.timestamp_millis_opt(1660497727833).unwrap(),
size: 10644,
+e_tag: None
},
partition_values: [ScalarValue::Int64(Some(2015)), ScalarValue::Int64(Some(1))].to_vec(),
range: None,
@@ -1575,8 +1575,8 @@
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(arrow::array::StringArray::from_slice(["a", "b", "c", "d"])),
Arc::new(arrow::array::Int32Array::from_slice([1, 10, 10, 100])),
Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c", "d"])),
Arc::new(arrow::array::Int32Array::from(vec![1, 10, 10, 100])),
],
)
.unwrap();
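The `from_slice` edits in these tests, and the identical ones in the delete tests below, exist because datafusion 27 drops the re-exported `FromSlice` helper; arrow's stock `From<Vec<_>>` constructors cover the same ground. A small sketch:

```rust
use arrow::array::{Int32Array, StringArray};

// Test fixtures built with arrow's From<Vec<T>> impls instead of the
// removed datafusion::from_slice::FromSlice helper.
fn sample_columns() -> (StringArray, Int32Array) {
    let strings = StringArray::from(vec!["a", "b", "c", "d"]);
    let numbers = Int32Array::from(vec![1, 10, 10, 100]);
    (strings, numbers)
}
```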
31 changes: 15 additions & 16 deletions rust/src/operations/delete.rs
@@ -333,7 +333,6 @@ mod tests {
use arrow::datatypes::{Field, Schema};
use arrow::record_batch::RecordBatch;
use datafusion::assert_batches_sorted_eq;
-use datafusion::from_slice::FromSlice;
use datafusion::prelude::*;
use std::sync::Arc;

@@ -358,9 +357,9 @@
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from_slice([1, 10, 10, 100])),
Arc::new(arrow::array::StringArray::from_slice([
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from(vec![1, 10, 10, 100])),
Arc::new(arrow::array::StringArray::from(vec![
"2021-02-02",
"2021-02-02",
"2021-02-02",
@@ -411,9 +410,9 @@
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from_slice([1, 10, 10, 100])),
Arc::new(arrow::array::StringArray::from_slice([
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from(vec![1, 10, 10, 100])),
Arc::new(arrow::array::StringArray::from(vec![
"2021-02-02",
"2021-02-02",
"2021-02-02",
@@ -435,9 +434,9 @@
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from_slice([0, 20, 10, 100])),
Arc::new(arrow::array::StringArray::from_slice([
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from(vec![0, 20, 10, 100])),
Arc::new(arrow::array::StringArray::from(vec![
"2021-02-02",
"2021-02-02",
"2021-02-02",
@@ -586,9 +585,9 @@
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from_slice([0, 20, 10, 100])),
Arc::new(arrow::array::StringArray::from_slice([
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from(vec![0, 20, 10, 100])),
Arc::new(arrow::array::StringArray::from(vec![
"2021-02-02",
"2021-02-03",
"2021-02-02",
@@ -644,9 +643,9 @@
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(arrow::array::StringArray::from_slice(["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from_slice([0, 20, 10, 100])),
Arc::new(arrow::array::StringArray::from_slice([
Arc::new(arrow::array::StringArray::from(vec!["A", "B", "A", "A"])),
Arc::new(arrow::array::Int32Array::from(vec![0, 20, 10, 100])),
Arc::new(arrow::array::StringArray::from(vec![
"2021-02-02",
"2021-02-03",
"2021-02-02",
6 changes: 5 additions & 1 deletion rust/src/operations/transaction/state.rs
@@ -4,9 +4,9 @@ use arrow::array::ArrayRef;
use arrow::datatypes::{
DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef,
};
+use datafusion::datasource::physical_plan::wrap_partition_type_in_dict;
use datafusion::optimizer::utils::conjunction;
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
-use datafusion::physical_plan::file_format::wrap_partition_type_in_dict;
use datafusion_common::config::ConfigOptions;
use datafusion_common::scalar::ScalarValue;
use datafusion_common::{Column, DFSchema, Result as DFResult, TableReference};
@@ -362,6 +362,10 @@ impl ContextProvider for DummyContextProvider {
fn options(&self) -> &ConfigOptions {
&self.options
}

+fn get_window_meta(&self, _name: &str) -> Option<Arc<datafusion_expr::WindowUDF>> {
+unimplemented!()
+}
}

#[cfg(test)]

