From fa836bb2dfa41dc39360449aba7db92245a2420c Mon Sep 17 00:00:00 2001 From: Noah Holm <32292420+noppaz@users.noreply.github.com> Date: Mon, 10 Apr 2023 15:52:31 +0200 Subject: [PATCH] add maximum_seed_size_mib to global config --- core/dbt/cli/main.py | 1 + core/dbt/cli/params.py | 10 +++++++++- core/dbt/constants.py | 11 ----------- core/dbt/contracts/project.py | 1 + core/dbt/events/types.py | 18 +++++++++++------- core/dbt/flags.py | 1 + core/dbt/parser/read_files.py | 15 ++++++++++++--- test/unit/test_graph_selector_methods.py | 4 ++-- .../defer_state/test_modified_state.py | 2 +- 9 files changed, 38 insertions(+), 25 deletions(-) diff --git a/core/dbt/cli/main.py b/core/dbt/cli/main.py index 4651cdbb1a9..259d679b3e2 100644 --- a/core/dbt/cli/main.py +++ b/core/dbt/cli/main.py @@ -93,6 +93,7 @@ def invoke(self, args: List[str]) -> Tuple[Optional[List], bool]: @p.warn_error @p.warn_error_options @p.write_json +@p.maximum_seed_size_mib def cli(ctx, **kwargs): """An ELT tool for managing your SQL transformations and data models. For more documentation on these commands, visit: docs.getdbt.com diff --git a/core/dbt/cli/params.py b/core/dbt/cli/params.py index ec1ab82dd62..463f2449971 100644 --- a/core/dbt/cli/params.py +++ b/core/dbt/cli/params.py @@ -462,7 +462,7 @@ def _version_callback(ctx, _param, value): envvar="DBT_WARN_ERROR_OPTIONS", default="{}", help="""If dbt would normally warn, instead raise an exception based on include/exclude configuration. Examples include --select that selects nothing, deprecations, configurations with no associated models, invalid test configurations, - and missing sources/refs in tests. This argument should be a YAML string, with keys 'include' or 'exclude'. eg. '{"include": "all", "exclude": ["NoNodesForSelectionCriteria"]}'""", + and missing sources/refs in tests. This argument should be a JSON string, with keys 'include' or 'exclude'. eg. '{"include": "all", "exclude": ["NoNodesForSelectionCriteria"]}'""", type=WarnErrorOptionsType(), ) @@ -479,3 +479,11 @@ def _version_callback(ctx, _param, value): help="TODO: No help text currently available", default=True, ) + +maximum_seed_size_mib = click.option( + "--maximum-seed-size-mib", + envvar="DBT_MAXIMUM_SEED_SIZE_MIB", + help="Specify max size (MiB) for seed files that will be hashed for state comparison.", + type=click.INT, + default=1, +) diff --git a/core/dbt/constants.py b/core/dbt/constants.py index ddc1cb9cb4f..355334b1ec8 100644 --- a/core/dbt/constants.py +++ b/core/dbt/constants.py @@ -1,19 +1,8 @@ -import os - SECRET_ENV_PREFIX = "DBT_ENV_SECRET_" DEFAULT_ENV_PLACEHOLDER = "DBT_DEFAULT_PLACEHOLDER" METADATA_ENV_PREFIX = "DBT_ENV_CUSTOM_ENV_" -def get_max_seed_size(): - mx = os.getenv("DBT_MAXIMUM_SEED_SIZE", "1") - return int(mx) - - -DEFAULT_MAXIMUM_SEED_SIZE = 1 * 1024 * 1024 -MAXIMUM_SEED_SIZE = get_max_seed_size() * DEFAULT_MAXIMUM_SEED_SIZE -MAXIMUM_SEED_SIZE_NAME = str(get_max_seed_size()) + "MiB" - PIN_PACKAGE_URL = ( "https://docs.getdbt.com/docs/package-management#section-specifying-package-versions" ) diff --git a/core/dbt/contracts/project.py b/core/dbt/contracts/project.py index 3041b1bd4b9..38fb8949f73 100644 --- a/core/dbt/contracts/project.py +++ b/core/dbt/contracts/project.py @@ -262,6 +262,7 @@ class UserConfig(ExtensibleDbtClassMixin, Replaceable, UserConfigContract): static_parser: Optional[bool] = None indirect_selection: Optional[str] = None cache_selected_only: Optional[bool] = None + maximum_seed_size_mib: Optional[int] = None @dataclass diff --git a/core/dbt/events/types.py b/core/dbt/events/types.py index 0bb16966f1d..891a84fd064 100644 --- a/core/dbt/events/types.py +++ b/core/dbt/events/types.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from dbt.ui import line_wrap_message, warning_tag, red, green, yellow -from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL +from dbt.constants import PIN_PACKAGE_URL from dbt.events.base_types import ( DynamicLevel, NoFile, @@ -19,7 +19,7 @@ from dbt.events.proto_types import EventInfo, RunResultMsg, ListOfStrings # noqa from dbt.events.proto_types import NodeInfo, ReferenceKeyMsg, TimingInfoMsg # noqa from dbt.events import proto_types as pt - +from dbt.flags import get_flags from dbt.node_types import NodeType @@ -54,6 +54,10 @@ def format_adapter_message(name, base_msg, args) -> str: return f"{name} adapter: {msg}" +def get_maximum_seed_size_name() -> str: + return str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB" + + # ======================================================= # A - Pre-project loading # ======================================================= @@ -972,8 +976,8 @@ def code(self): def message(self) -> str: msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was " - f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed" + f">{get_maximum_seed_size_name()} in size. The previous file was " + f"<={get_maximum_seed_size_name()}, so it has changed" ) return msg @@ -986,7 +990,7 @@ def code(self): def message(self) -> str: msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt " + f">{get_maximum_seed_size_name()} in size at the same path, dbt " f"cannot tell if it has changed: assuming they are the same" ) return msg @@ -1000,7 +1004,7 @@ def code(self): def message(self) -> str: msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in " + f">{get_maximum_seed_size_name()} in size. The previous file was in " f"a different location, assuming it has changed" ) return msg @@ -1014,7 +1018,7 @@ def code(self): def message(self) -> str: msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a " + f">{get_maximum_seed_size_name()} in size. The previous file had a " f"checksum type of {self.checksum_name}, so it has changed" ) return msg diff --git a/core/dbt/flags.py b/core/dbt/flags.py index 5d3d7032c9c..ea4f4e310f8 100644 --- a/core/dbt/flags.py +++ b/core/dbt/flags.py @@ -85,6 +85,7 @@ def get_flag_dict(): "cache_selected_only", "target_path", "log_path", + "maximum_seed_size_mib", } return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr} diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index e845d218476..5df97e68f23 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -19,6 +19,7 @@ from typing import Optional, Dict, List, Mapping from dbt.events.types import InputFileDiffError from dbt.events.functions import fire_event +from dbt.flags import get_flags @dataclass @@ -36,7 +37,14 @@ class FileDiff(dbtClassMixin): changed: List[InputFile] added: List[InputFile] -from dbt.constants import MAXIMUM_SEED_SIZE, DEFAULT_MAXIMUM_SEED_SIZE + +DEFAULT_MAXIMUM_SEED_SIZE = 1 * 1024 * 1024 + + +def get_max_seed_size() -> int: + """The maximum seed size (MiB) that will be hashed for state comparison.""" + flags = get_flags() + return flags.MAXIMUM_SEED_SIZE_MIB * DEFAULT_MAXIMUM_SEED_SIZE # This loads the files contents and creates the SourceFile object @@ -116,8 +124,9 @@ def validate_yaml(file_path, dct): # Special processing for big seed files def load_seed_source_file(match: FilePath, project_name) -> SourceFile: - # MAXIMUM_SEED_SIZE = 0 means no limit - if match.file_size() > MAXIMUM_SEED_SIZE and MAXIMUM_SEED_SIZE != 0: + maximum_seed_size = get_max_seed_size() + # maximum_seed_size = 0 means no limit + if match.file_size() > maximum_seed_size and maximum_seed_size != 0: # We don't want to calculate a hash of this file. Use the path. source_file = SourceFile.big_seed(match) elif match.file_size() <= DEFAULT_MAXIMUM_SEED_SIZE: diff --git a/test/unit/test_graph_selector_methods.py b/test/unit/test_graph_selector_methods.py index 628d823da5d..3ccf644173e 100644 --- a/test/unit/test_graph_selector_methods.py +++ b/test/unit/test_graph_selector_methods.py @@ -1036,7 +1036,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat event = warn_or_error_patch.call_args[0][0] assert type(event).__name__ == 'SeedExceedsLimitSamePath' msg = event.message() - assert msg.startswith('Found a seed (pkg.seed) >1MB in size') + assert msg.startswith('Found a seed (pkg.seed) >1MiB in size') with mock.patch('dbt.contracts.graph.nodes.warn_or_error') as warn_or_error_patch: assert not search_manifest_using_method(manifest, method, 'new') warn_or_error_patch.assert_not_called() @@ -1053,7 +1053,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state event = warn_or_error_patch.call_args[0][0] assert type(event).__name__ == 'SeedIncreased' msg = event.message() - assert msg.startswith('Found a seed (pkg.seed) >1MB in size') + assert msg.startswith('Found a seed (pkg.seed) >1MiB in size') with mock.patch('dbt.contracts.graph.nodes.warn_or_error') as warn_or_error_patch: assert not search_manifest_using_method(manifest, method, 'new') warn_or_error_patch.assert_not_called() diff --git a/tests/functional/defer_state/test_modified_state.py b/tests/functional/defer_state/test_modified_state.py index 80e3d455da1..0216e329153 100644 --- a/tests/functional/defer_state/test_modified_state.py +++ b/tests/functional/defer_state/test_modified_state.py @@ -137,7 +137,7 @@ def test_changed_seed_contents_state(self, project): "./state", ] ) - assert ">1MB" in str(exc.value) + assert ">1MiB" in str(exc.value) shutil.rmtree("./state") self.copy_state()