From 23e3ab1b2733565b3627d2c2577aba4570369024 Mon Sep 17 00:00:00 2001
From: Faisal
Date: Mon, 6 Jan 2025 11:51:53 -0500
Subject: [PATCH] Ruff linting fix (#365)

* ruff fixes for typing

* bump version: v0.16.0
---
 datacompy/__init__.py     |  2 +-
 datacompy/base.py         |  4 ++--
 datacompy/core.py         | 10 +++++-----
 datacompy/fugue.py        | 24 ++++++++++++------------
 datacompy/polars.py       | 12 ++++++------
 datacompy/snowflake.py    | 20 +++++++++++---------
 datacompy/spark/legacy.py | 26 +++++++++++++-------------
 datacompy/spark/pandas.py | 10 +++++-----
 datacompy/spark/sql.py    | 10 +++++-----
 9 files changed, 60 insertions(+), 58 deletions(-)

diff --git a/datacompy/__init__.py b/datacompy/__init__.py
index 9f518084..8f5651c6 100644
--- a/datacompy/__init__.py
+++ b/datacompy/__init__.py
@@ -18,7 +18,7 @@
 Then extended to carry that functionality over to Spark Dataframes.
 """
 
-__version__ = "0.15.0"
+__version__ = "0.16.0"
 
 import platform
 from warnings import warn
diff --git a/datacompy/base.py b/datacompy/base.py
index d8ee6f96..27425da6 100644
--- a/datacompy/base.py
+++ b/datacompy/base.py
@@ -22,7 +22,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any
 
 from ordered_set import OrderedSet
 
@@ -154,7 +154,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report."""
         pass
diff --git a/datacompy/core.py b/datacompy/core.py
index a7cdcca2..b0d10d77 100644
--- a/datacompy/core.py
+++ b/datacompy/core.py
@@ -22,7 +22,7 @@
 """
 
 import os
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, cast
 
 import numpy as np
 import pandas as pd
@@ -84,7 +84,7 @@ def __init__(
         self,
         df1: pd.DataFrame,
         df2: pd.DataFrame,
-        join_columns: Optional[Union[List[str], str]] = None,
+        join_columns: List[str] | str | None = None,
         on_index: bool = False,
         abs_tol: float = 0,
         rel_tol: float = 0,
@@ -100,7 +100,7 @@
         elif on_index:
             self.on_index = True
             self.join_columns = []
-        elif isinstance(join_columns, (str, int, float)):
+        elif isinstance(join_columns, str | int | float):
             self.join_columns = [
                 str(join_columns).lower()
                 if self.cast_column_names_lower
@@ -564,7 +564,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report.
 
@@ -728,7 +728,7 @@ def df_to_str(pdf: pd.DataFrame) -> str:
         return report
 
 
-def render(filename: str, *fields: Union[int, float, str]) -> str:
+def render(filename: str, *fields: int | float | str) -> str:
     """Render out an individual template.
 
     This basically just reads in a
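Note: the core.py hunks above show the two rewrite patterns that the rest of this patch repeats: PEP 604 union syntax in annotations (Optional[X] -> X | None, Union[X, Y] -> X | Y) and a union type as the second argument to isinstance(). A minimal sketch of the equivalence, assuming Python 3.10+ (which PEP 604 requires at runtime); the function names are illustrative, not from datacompy:

    from typing import List, Optional, Union

    # Old spelling: typing.Optional / typing.Union.
    def old_style(join_columns: Optional[Union[List[str], str]] = None) -> None: ...

    # New spelling adopted by this patch: the | operator builds the same union.
    def new_style(join_columns: List[str] | str | None = None) -> None: ...

    # Both spellings compare equal at runtime on 3.10+.
    assert (List[str] | str | None) == Optional[Union[List[str], str]]

    # Since 3.10, isinstance() accepts a union as well as a tuple of types.
    assert isinstance("a", str | int | float)
    assert isinstance("a", (str, int, float))  # still valid and equivalent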
diff --git a/datacompy/fugue.py b/datacompy/fugue.py
index fff14ba3..57d097a9 100644
--- a/datacompy/fugue.py
+++ b/datacompy/fugue.py
@@ -17,7 +17,7 @@
 import pickle
 from collections import defaultdict
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, Iterable, List, Tuple, cast
 
 import pandas as pd
 from ordered_set import OrderedSet
 
@@ -105,7 +105,7 @@ def all_columns_match(df1: "AnyDataFrame", df2: "AnyDataFrame") -> bool:
 def is_match(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
@@ -113,7 +113,7 @@ def is_match(
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> bool:
     """Check whether two dataframes match.
@@ -204,7 +204,7 @@
 def all_rows_overlap(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
@@ -212,7 +212,7 @@
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> bool:
     """Check if the rows are all present in both dataframes.
@@ -300,7 +300,7 @@
 def count_matching_rows(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
@@ -308,7 +308,7 @@
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> int:
     """Count the number of rows match (on overlapping fields).
@@ -395,7 +395,7 @@
 def report(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     abs_tol: float = 0,
     rel_tol: float = 0,
     df1_name: str = "df1",
@@ -405,8 +405,8 @@
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
     sample_count: int = 10,
     column_count: int = 10,
-    html_file: Optional[str] = None,
-    parallelism: Optional[int] = None,
+    html_file: str | None = None,
+    parallelism: int | None = None,
 ) -> str:
     """Return a string representation of a report.
@@ -648,7 +648,7 @@ def _any(col: str) -> int:
 def _distributed_compare(
     df1: "AnyDataFrame",
     df2: "AnyDataFrame",
-    join_columns: Union[str, List[str]],
+    join_columns: str | List[str],
     return_obj_func: Callable[[Compare], Any],
     abs_tol: float = 0,
     rel_tol: float = 0,
@@ -657,7 +657,7 @@
     ignore_spaces: bool = False,
     ignore_case: bool = False,
     cast_column_names_lower: bool = True,
-    parallelism: Optional[int] = None,
+    parallelism: int | None = None,
     strict_schema: bool = False,
 ) -> List[Any]:
     """Compare the data distributively using the core Compare class.
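One nuance in hunks like the parallelism one above: Optional[int] never meant "optional argument", only "int or None", so parallelism: Optional[int] = None and parallelism: int | None = None are the same type with the same explicit default. A small hypothetical sketch (run_compare is not a datacompy function):

    # Hypothetical function; the None default must still be spelled out.
    def run_compare(parallelism: int | None = None) -> int:
        # Fall back to a single worker when no parallelism is supplied.
        return 1 if parallelism is None else parallelism

    assert run_compare() == 1
    assert run_compare(parallelism=4) == 4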
diff --git a/datacompy/polars.py b/datacompy/polars.py
index d6300b8b..1197ae20 100644
--- a/datacompy/polars.py
+++ b/datacompy/polars.py
@@ -23,7 +23,7 @@
 """
 
 import os
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, cast
 
 import numpy as np
 import polars as pl
@@ -85,7 +85,7 @@ def __init__(
         self,
         df1: "pl.DataFrame",
         df2: "pl.DataFrame",
-        join_columns: Union[List[str], str],
+        join_columns: List[str] | str,
         abs_tol: float = 0,
         rel_tol: float = 0,
         df1_name: str = "df1",
@@ -327,8 +327,8 @@ def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
         creates a column column_match which is True for matches,
         False otherwise.
         """
-        match_cnt: Union[int, float]
-        null_diff: Union[int, float]
+        match_cnt: int | float
+        null_diff: int | float
         LOG.debug("Comparing intersection")
         row_cnt = len(self.intersect_rows)
@@ -571,7 +571,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report.
 
@@ -734,7 +734,7 @@ def df_to_str(pdf: "pl.DataFrame") -> str:
         return report
 
 
-def render(filename: str, *fields: Union[int, float, str]) -> str:
+def render(filename: str, *fields: int | float | str) -> str:
     """Render out an individual template.
 
     This basically just reads in a
diff --git a/datacompy/snowflake.py b/datacompy/snowflake.py
index 007f0023..1cd3247a 100644
--- a/datacompy/snowflake.py
+++ b/datacompy/snowflake.py
@@ -24,7 +24,7 @@
 import os
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Union, cast
 
 import pandas as pd
 from ordered_set import OrderedSet
@@ -115,11 +115,11 @@ def __init__(
         session: "sp.Session",
         df1: Union[str, "sp.DataFrame"],
         df2: Union[str, "sp.DataFrame"],
-        join_columns: Optional[Union[List[str], str]],
+        join_columns: List[str] | str | None,
         abs_tol: float = 0,
         rel_tol: float = 0,
-        df1_name: Optional[str] = None,
-        df2_name: Optional[str] = None,
+        df1_name: str | None = None,
+        df2_name: str | None = None,
         ignore_spaces: bool = False,
     ) -> None:
         if join_columns is None:
@@ -128,7 +128,7 @@
         elif not join_columns:
             errmsg = "join_columns is empty"
             raise ValueError(errmsg)
-        elif isinstance(join_columns, (str, int, float)):
+        elif isinstance(join_columns, str | int | float):
             self.join_columns = [str(join_columns).replace('"', "").upper()]
         else:
             self.join_columns = [
@@ -155,7 +155,7 @@ def df1(self) -> "sp.DataFrame":
         return self._df1
 
     @df1.setter
-    def df1(self, df1: tuple[Union[str, "sp.DataFrame"], Optional[str]]) -> None:
+    def df1(self, df1: tuple[Union[str, "sp.DataFrame"], str | None]) -> None:
         """Check that df1 is either a Snowpark DF or the name of a valid Snowflake table."""
         (df, df_name) = df1
         if isinstance(df, str):
@@ -176,7 +176,7 @@ def df2(self) -> "sp.DataFrame":
         return self._df2
 
     @df2.setter
-    def df2(self, df2: tuple[Union[str, "sp.DataFrame"], Optional[str]]) -> None:
+    def df2(self, df2: tuple[Union[str, "sp.DataFrame"], str | None]) -> None:
         """Check that df2 is either a Snowpark DF or the name of a valid Snowflake table."""
         (df, df_name) = df2
         if isinstance(df, str):
@@ -215,6 +215,7 @@ def _validate_dataframe(self, df_name: str, index: str) -> None:
             zip(
                 self._df1.columns,
                 [str(c).replace('"', "").upper() for c in self._df1.columns],
+                strict=False,
             )
         )
         self._df1 = self._df1.rename(col_map)
@@ -223,6 +224,7 @@ def _validate_dataframe(self, df_name: str, index: str) -> None:
             zip(
                 self._df2.columns,
                 [str(c).replace('"', "").upper() for c in self._df2.columns],
+                strict=False,
             )
         )
         self._df2 = self._df2.rename(dict(col_map))
@@ -711,7 +713,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report.
 
@@ -876,7 +878,7 @@ def report(
         return report
 
 
-def render(filename: str, *fields: Union[int, float, str]) -> str:
+def render(filename: str, *fields: int | float | str) -> str:
     """Render out an individual template.
 
     This basically just reads in a
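The strict=False arguments added above are the only change in this patch that touches a call site rather than an annotation. On Python 3.10+, ruff's zip-without-explicit-strict rule (B905, from flake8-bugbear) asks for an explicit strict= argument: strict=False keeps zip's historical behavior of silently stopping at the shorter iterable, while strict=True raises on a length mismatch. In these hunks the two iterables are a column list and a transformed copy of it, so their lengths always match and strict=False merely satisfies the linter. A quick sketch of the difference:

    # strict=False (the patch's choice) stops at the shorter iterable.
    assert list(zip([1, 2, 3], ["a", "b"], strict=False)) == [(1, "a"), (2, "b")]

    # strict=True raises instead when the lengths differ.
    try:
        list(zip([1, 2, 3], ["a", "b"], strict=True))
    except ValueError:
        pass  # e.g. "zip() argument 2 is shorter than argument 1"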
diff --git a/datacompy/spark/legacy.py b/datacompy/spark/legacy.py
index 3555b8b0..c32db3b7 100644
--- a/datacompy/spark/legacy.py
+++ b/datacompy/spark/legacy.py
@@ -17,7 +17,7 @@
 import sys
 from enum import Enum
 from itertools import chain
-from typing import Any, Dict, List, Optional, Set, TextIO, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, TextIO, Tuple
 from warnings import warn
 
 try:
@@ -160,10 +160,10 @@ def __init__(
         spark_session: "pyspark.sql.SparkSession",
         base_df: "pyspark.sql.DataFrame",
         compare_df: "pyspark.sql.DataFrame",
-        join_columns: List[Union[str, Tuple[str, str]]],
-        column_mapping: Optional[List[Tuple[str, str]]] = None,
+        join_columns: List[str | Tuple[str, str]],
+        column_mapping: List[Tuple[str, str]] | None = None,
         cache_intermediates: bool = False,
-        known_differences: Optional[List[Dict[str, Any]]] = None,
+        known_differences: List[Dict[str, Any]] | None = None,
         rel_tol: float = 0,
         abs_tol: float = 0,
         show_all_columns: bool = False,
@@ -198,14 +198,14 @@ def __init__(
 
         self.spark = spark_session
         self.base_unq_rows = self.compare_unq_rows = None
-        self._base_row_count: Optional[int] = None
-        self._compare_row_count: Optional[int] = None
-        self._common_row_count: Optional[int] = None
-        self._joined_dataframe: Optional[pyspark.sql.DataFrame] = None
-        self._rows_only_base: Optional[pyspark.sql.DataFrame] = None
-        self._rows_only_compare: Optional[pyspark.sql.DataFrame] = None
-        self._all_matched_rows: Optional[pyspark.sql.DataFrame] = None
-        self._all_rows_mismatched: Optional[pyspark.sql.DataFrame] = None
+        self._base_row_count: int | None = None
+        self._compare_row_count: int | None = None
+        self._common_row_count: int | None = None
+        self._joined_dataframe: pyspark.sql.DataFrame | None = None
+        self._rows_only_base: pyspark.sql.DataFrame | None = None
+        self._rows_only_compare: pyspark.sql.DataFrame | None = None
+        self._all_matched_rows: pyspark.sql.DataFrame | None = None
+        self._all_rows_mismatched: pyspark.sql.DataFrame | None = None
         self.columns_match_dict: Dict[str, Any] = {}
 
         # drop the duplicates before actual comparison made.
@@ -219,7 +219,7 @@ def __init__(
         self._compare_row_count = self.compare_df.count()
 
     def _tuplizer(
-        self, input_list: List[Union[str, Tuple[str, str]]]
+        self, input_list: List[str | Tuple[str, str]]
     ) -> List[Tuple[str, str]]:
         join_columns: List[Tuple[str, str]] = []
         for val in input_list:
diff --git a/datacompy/spark/pandas.py b/datacompy/spark/pandas.py
index 4ce48aad..0886dd18 100644
--- a/datacompy/spark/pandas.py
+++ b/datacompy/spark/pandas.py
@@ -23,7 +23,7 @@
 import logging
 import os
-from typing import List, Optional, Union
+from typing import List
 from warnings import warn
 
 import pandas as pd
@@ -94,7 +94,7 @@ def __init__(
         self,
         df1: "ps.DataFrame",
         df2: "ps.DataFrame",
-        join_columns: Union[List[str], str],
+        join_columns: List[str] | str,
         abs_tol: float = 0,
         rel_tol: float = 0,
         df1_name: str = "df1",
@@ -110,7 +110,7 @@
             ps.set_option("compute.ops_on_diff_frames", True)
         self.cast_column_names_lower = cast_column_names_lower
 
-        if isinstance(join_columns, (str, int, float)):
+        if isinstance(join_columns, str | int | float):
             self.join_columns = [
                 (
                     str(join_columns).lower()
@@ -638,7 +638,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report.
 
@@ -793,7 +793,7 @@ def report(
         return report
 
 
-def render(filename: str, *fields: Union[int, float, str]) -> str:
+def render(filename: str, *fields: int | float | str) -> str:
     """Render out an individual template.
 
     This basically just reads in a
diff --git a/datacompy/spark/sql.py b/datacompy/spark/sql.py
index 050c5ba5..63e8cb26 100644
--- a/datacompy/spark/sql.py
+++ b/datacompy/spark/sql.py
@@ -23,7 +23,7 @@
 """
 
 import os
 from copy import deepcopy
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple
 
 import pandas as pd
 from ordered_set import OrderedSet
@@ -132,7 +132,7 @@ def __init__(
         spark_session: "pyspark.sql.SparkSession",
         df1: "pyspark.sql.DataFrame",
         df2: "pyspark.sql.DataFrame",
-        join_columns: Union[List[str], str],
+        join_columns: List[str] | str,
         abs_tol: float = 0,
         rel_tol: float = 0,
         df1_name: str = "df1",
@@ -142,7 +142,7 @@
         cast_column_names_lower: bool = True,
     ) -> None:
         self.cast_column_names_lower = cast_column_names_lower
-        if isinstance(join_columns, (str, int, float)):
+        if isinstance(join_columns, str | int | float):
             self.join_columns = [
                 (
                     str(join_columns).lower()
@@ -721,7 +721,7 @@ def report(
         self,
         sample_count: int = 10,
         column_count: int = 10,
-        html_file: Optional[str] = None,
+        html_file: str | None = None,
     ) -> str:
         """Return a string representation of a report.
 
@@ -886,7 +886,7 @@ def report(
         return report
 
 
-def render(filename: str, *fields: Union[int, float, str]) -> str:
+def render(filename: str, *fields: int | float | str) -> str:
     """Render out an individual template.
 
     This basically just reads in a
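A closing note for reviewers: most of this patch only touches annotations, but the isinstance(..., str | int | float) rewrites execute at call time, so after this change the package effectively assumes a Python 3.10+ interpreter. Annotation-only uses of | could be kept lazy on older versions with the annotations future import, but a union object passed to isinstance() cannot. A sketch of that distinction (hypothetical function, not from datacompy):

    from __future__ import annotations  # makes annotations lazy strings

    # Lazy annotations: this def works even on pre-3.10 interpreters,
    # because int | None is never evaluated.
    def f(x: int | None = None) -> int | None:
        return x

    # isinstance() evaluates its union argument eagerly: this line builds a
    # types.UnionType and therefore needs Python 3.10+.
    assert isinstance(3, int | float)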