Skip to content

Commit

Permalink
Strip Parquet file_group information from expected plans (#16)
Browse files Browse the repository at this point in the history
* ignore Parquet paths in tests

* remove file group info
  • Loading branch information
andygrove authored Oct 4, 2024
1 parent 2c8b8b8 commit f2500d1
Show file tree
Hide file tree
Showing 24 changed files with 168 additions and 443 deletions.
21 changes: 16 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,18 +120,29 @@ python -m pip install -r requirements-in.txt

Whenever rust code changes (your changes or via `git pull`):

````bash
```bash
# make sure you activate the venv using "source venv/bin/activate" first
maturin develop python -m pytest ```
maturin develop
python -m pytest
```

## Testing

Running local Rust tests requires generating the tpch-data. This can be done
by running the following command:
by running the following commands:

```bash
./scripts/generate_tpch_data.sh
export TPCH_TEST_PARTITIONS=1
export TPCH_SCALING_FACTOR=1
./scripts/gen-test-data.sh
```

This will generate data into a top-level `data` directory.

Tests can be run with:

```shell
export TPCH_DATA_PATH=`pwd`/data
cargo test
```

Tests compare plans with expected plans, which unfortunately contain the
Expand Down
44 changes: 0 additions & 44 deletions scripts/replace-expected-plans-paths.sh

This file was deleted.

19 changes: 10 additions & 9 deletions src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,9 @@ mod test {
}

async fn do_test(n: u8) -> TestResult<()> {
let data_path = env::var("TPCH_DATA_PATH")?;
let tpch_path_env_var = "TPCH_DATA_PATH";
let data_path = env::var(tpch_path_env_var).expect(&format!("Environment variable {} not found", tpch_path_env_var));

let file = format!("testdata/queries/q{n}.sql");
let sql = fs::read_to_string(&file)?;
let config = SessionConfig::new().with_target_partitions(1);
Expand Down Expand Up @@ -432,19 +434,18 @@ mod test {
displayable(query_stage.plan.as_ref()).indent(false)
));
}

// Remove Parquet file group information since it will vary between CI/CD and local
let re = Regex::new(r"file_groups=\{.*}")?;
let cleaned_output = re.replace_all(output.as_str(), "file_groups={ ... }");

let expected_file = format!("testdata/expected-plans/q{n}.txt");
if !Path::new(&expected_file).exists() {
fs::write(&expected_file, &output)?;
fs::write(&expected_file, &*cleaned_output)?;
}
let expected_plan = fs::read_to_string(&expected_file)?;

let re = Regex::new(r":[^]]*]")?;

// Remove the byte offsets from the plans, seems non repeatable
// between CI/CD and local
let cleaned_expected_plan = re.replace_all(&expected_plan, "]");
let cleaned_output = re.replace_all(&output, "]");
assert_eq!(cleaned_expected_plan, cleaned_output);
assert_eq!(expected_plan, cleaned_output);
Ok(())
}
}
4 changes: 2 additions & 2 deletions testdata/expected-plans/q1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ SortExec: expr=[l_returnflag@0 ASC NULLS LAST,l_linestatus@1 ASC NULLS LAST], pr
ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
CoalesceBatchesExec: target_batch_size=8192
FilterExec: l_shipdate@6 <= 1998-09-24
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]
ParquetExec: file_groups={ ... }, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]

RaySQL Plan
===========
Expand All @@ -29,5 +29,5 @@ SortExec: expr=[l_returnflag@0 ASC NULLS LAST,l_linestatus@1 ASC NULLS LAST], pr
ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus]
CoalesceBatchesExec: target_batch_size=8192
FilterExec: l_shipdate@6 <= 1998-09-24
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]
ParquetExec: file_groups={ ... }, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[]

16 changes: 8 additions & 8 deletions testdata/expected-plans/q10.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,20 @@ SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[false]
ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, c_comment@6 as c_comment, l_extendedprice@7 as l_extendedprice, l_discount@8 as l_discount, n_name@0 as n_name]
CoalesceBatchesExec: target_batch_size=8192
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10]
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/nation.parquet]]}, projection=[n_nationkey, n_name]
ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name]
CoalesceBatchesExec: target_batch_size=8192
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10]
CoalesceBatchesExec: target_batch_size=8192
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7]
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/customer.parquet:0..13211178]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey]
CoalesceBatchesExec: target_batch_size=8192
FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/orders.parquet:0..54530383]]}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount]
CoalesceBatchesExec: target_batch_size=8192
FilterExec: l_returnflag@3 = R
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]
ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]

RaySQL Plan
===========
Expand All @@ -54,18 +54,18 @@ SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[false]
ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, c_comment@6 as c_comment, l_extendedprice@7 as l_extendedprice, l_discount@8 as l_discount, n_name@0 as n_name]
CoalesceBatchesExec: target_batch_size=8192
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10]
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/nation.parquet]]}, projection=[n_nationkey, n_name]
ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name]
CoalesceBatchesExec: target_batch_size=8192
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10]
CoalesceBatchesExec: target_batch_size=8192
HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7]
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/customer.parquet:0..13211178]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment]
ProjectionExec: expr=[o_orderkey@0 as o_orderkey, o_custkey@1 as o_custkey]
CoalesceBatchesExec: target_batch_size=8192
FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/orders.parquet:0..54530383]]}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[]
ProjectionExec: expr=[l_orderkey@0 as l_orderkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount]
CoalesceBatchesExec: target_batch_size=8192
FilterExec: l_returnflag@3 = R
ParquetExec: file_groups={1 group: [[home/runner/work/datafusion-ray/datafusion-ray/data/lineitem.parquet:0..208317955]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]
ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)]

Loading

0 comments on commit f2500d1

Please sign in to comment.