From 2786c5b4fc17d0c2ba91cb8e03a27ebf4b746e99 Mon Sep 17 00:00:00 2001 From: Daniel Sola Date: Tue, 5 Nov 2024 15:07:05 -0800 Subject: [PATCH 1/3] wip --- plugins/flytekit-auto-cache/README.md | 9 +++++ .../flytekitplugins/auto_cache/__init__.py | 10 +++++ plugins/flytekit-auto-cache/setup.py | 37 +++++++++++++++++++ plugins/flytekit-auto-cache/tests/__init__.py | 0 .../tests/test_auto_cache.py | 0 5 files changed, 56 insertions(+) create mode 100644 plugins/flytekit-auto-cache/README.md create mode 100644 plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py create mode 100644 plugins/flytekit-auto-cache/setup.py create mode 100644 plugins/flytekit-auto-cache/tests/__init__.py create mode 100644 plugins/flytekit-auto-cache/tests/test_auto_cache.py diff --git a/plugins/flytekit-auto-cache/README.md b/plugins/flytekit-auto-cache/README.md new file mode 100644 index 0000000000..76d0e1f853 --- /dev/null +++ b/plugins/flytekit-auto-cache/README.md @@ -0,0 +1,9 @@ +# Flytekit Auto Cache Plugin + + + +To install the plugin, run the following command: + +```bash +pip install flytekitplugins-auto-cache +``` diff --git a/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py b/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py new file mode 100644 index 0000000000..54d65b9e43 --- /dev/null +++ b/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py @@ -0,0 +1,10 @@ +""" +.. currentmodule:: flytekitplugins.auto_cache + +This package contains things that are useful when extending Flytekit. + +.. autosummary:: + :template: custom.rst + :toctree: generated/ + +""" diff --git a/plugins/flytekit-auto-cache/setup.py b/plugins/flytekit-auto-cache/setup.py new file mode 100644 index 0000000000..6ba1d5060f --- /dev/null +++ b/plugins/flytekit-auto-cache/setup.py @@ -0,0 +1,37 @@ +from setuptools import setup + +PLUGIN_NAME = "auto_cache" + +microlib_name = "flytekitplugins-auto-cache" + +plugin_requires = ["flytekit"] + +__version__ = "0.0.0+develop" + +setup( + name=microlib_name, + version=__version__, + author="flyteorg", + author_email="admin@flyte.org", + description="This package holds the auto cache plugins for flytekit", + namespace_packages=["flytekitplugins"], + packages=[f"flytekitplugins.{PLUGIN_NAME}"], + install_requires=plugin_requires, + license="apache2", + python_requires=">=3.8", + classifiers=[ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + ], + entry_points={"flytekit.plugins": [f"{PLUGIN_NAME}=flytekitplugins.{PLUGIN_NAME}"]}, +) diff --git a/plugins/flytekit-auto-cache/tests/__init__.py b/plugins/flytekit-auto-cache/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/plugins/flytekit-auto-cache/tests/test_auto_cache.py b/plugins/flytekit-auto-cache/tests/test_auto_cache.py new file mode 100644 index 0000000000..e69de29bb2 From b18bac7f112fc567225476d964ef19d76129706d Mon Sep 17 00:00:00 2001 From: Daniel Sola Date: Wed, 6 Nov 2024 15:41:58 -0800 Subject: [PATCH 2/3] initial auto cache method Signed-off-by: Daniel Sola --- flytekit/core/auto_cache.py | 37 +++++++++ flytekit/core/task.py | 17 ++-- .../flytekitplugins/auto_cache/__init__.py | 3 + .../auto_cache/cache_function_body.py | 53 ++++++++++++ .../tests/test_auto_cache.py | 80 +++++++++++++++++++ 5 files changed, 185 insertions(+), 5 deletions(-) create mode 100644 flytekit/core/auto_cache.py create mode 100644 plugins/flytekit-auto-cache/flytekitplugins/auto_cache/cache_function_body.py diff --git a/flytekit/core/auto_cache.py b/flytekit/core/auto_cache.py new file mode 100644 index 0000000000..adf4741a0b --- /dev/null +++ b/flytekit/core/auto_cache.py @@ -0,0 +1,37 @@ +from typing import Any, Callable, Protocol, runtime_checkable + + +@runtime_checkable +class AutoCache(Protocol): + """ + A protocol that defines the interface for a caching mechanism + that generates a version hash of a function based on its source code. + + Attributes: + salt (str): A string used to add uniqueness to the generated hash. Default is "salt". + + Methods: + get_version(func: Callable[..., Any]) -> str: + Given a function, generates a version hash based on its source code and the salt. + """ + + def __init__(self, salt: str = "salt") -> None: + """ + Initialize the AutoCache instance with a salt value. + + Args: + salt (str): A string to be used as the salt in the hashing process. Defaults to "salt". + """ + self.salt = salt + + def get_version(self, func: Callable[..., Any]) -> str: + """ + Generate a version hash for the provided function. + + Args: + func (Callable[..., Any]): A callable function whose version hash needs to be generated. + + Returns: + str: The SHA-256 hash of the function's source code combined with the salt. + """ + ... diff --git a/flytekit/core/task.py b/flytekit/core/task.py index 745f452a83..7519f341d6 100644 --- a/flytekit/core/task.py +++ b/flytekit/core/task.py @@ -5,6 +5,7 @@ from functools import update_wrapper from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union, overload +from flytekit.core.auto_cache import AutoCache from flytekit.core.utils import str2bool try: @@ -99,7 +100,7 @@ def find_pythontask_plugin(cls, plugin_config_type: type) -> Type[PythonFunction def task( _task_function: None = ..., task_config: Optional[T] = ..., - cache: bool = ..., + cache: Union[bool, list[AutoCache]] = ..., cache_serialize: bool = ..., cache_version: str = ..., cache_ignore_input_vars: Tuple[str, ...] = ..., @@ -137,7 +138,7 @@ def task( def task( _task_function: Callable[P, FuncOut], task_config: Optional[T] = ..., - cache: bool = ..., + cache: Union[bool, list[AutoCache]] = ..., cache_serialize: bool = ..., cache_version: str = ..., cache_ignore_input_vars: Tuple[str, ...] = ..., @@ -174,7 +175,7 @@ def task( def task( _task_function: Optional[Callable[P, FuncOut]] = None, task_config: Optional[T] = None, - cache: bool = False, + cache: Union[bool, list[AutoCache]] = False, cache_serialize: bool = False, cache_version: str = "", cache_ignore_input_vars: Tuple[str, ...] = (), @@ -248,7 +249,7 @@ def my_task(x: int, y: typing.Dict[str, str]) -> str: :param _task_function: This argument is implicitly passed and represents the decorated function :param task_config: This argument provides configuration for a specific task types. Please refer to the plugins documentation for the right object to use. - :param cache: Boolean that indicates if caching should be enabled + :param cache: Boolean that indicates if caching should be enabled or a list of AutoCache implementations :param cache_serialize: Boolean that indicates if identical (ie. same inputs) instances of this task should be executed in serial when caching is enabled. This means that given multiple concurrent executions over identical inputs, only a single instance executes and the rest wait to reuse the cached results. This @@ -343,10 +344,16 @@ def launch_dynamically(): """ def wrapper(fn: Callable[P, Any]) -> PythonFunctionTask[T]: + if isinstance(cache, list) and all(isinstance(item, AutoCache) for item in cache): + cache_versions = [item.get_version() for item in cache] + task_hash = "".join(cache_versions) + else: + task_hash = "" + _metadata = TaskMetadata( cache=cache, cache_serialize=cache_serialize, - cache_version=cache_version, + cache_version=cache_version if not task_hash else task_hash, cache_ignore_input_vars=cache_ignore_input_vars, retries=retries, interruptible=interruptible, diff --git a/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py b/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py index 54d65b9e43..5872eda7ee 100644 --- a/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py +++ b/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/__init__.py @@ -7,4 +7,7 @@ :template: custom.rst :toctree: generated/ + CacheFunctionBody """ + +from .cache_function_body import CacheFunctionBody diff --git a/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/cache_function_body.py b/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/cache_function_body.py new file mode 100644 index 0000000000..2e4472109d --- /dev/null +++ b/plugins/flytekit-auto-cache/flytekitplugins/auto_cache/cache_function_body.py @@ -0,0 +1,53 @@ +import ast +import hashlib +import inspect +from typing import Any, Callable + + +class CacheFunctionBody: + """ + A class that implements a versioning mechanism for functions by generating + a SHA-256 hash of the function's source code combined with a salt. + + Attributes: + salt (str): A string used to add uniqueness to the generated hash. Default is "salt". + + Methods: + get_version(func: Callable[..., Any]) -> str: + Given a function, generates a version hash based on its source code and the salt. + """ + + def __init__(self, salt: str = "salt") -> None: + """ + Initialize the CacheFunctionBody instance with a salt value. + + Args: + salt (str): A string to be used as the salt in the hashing process. Defaults to "salt". + """ + self.salt = salt + + def get_version(self, func: Callable[..., Any]) -> str: + """ + Generate a version hash for the provided function by parsing its source code + and adding a salt before applying the SHA-256 hash function. + + Args: + func (Callable[..., Any]): A callable function whose version hash needs to be generated. + + Returns: + str: The SHA-256 hash of the function's source code combined with the salt. + """ + # Get the source code of the function + source = inspect.getsource(func) + + # Parse the source code into an Abstract Syntax Tree (AST) + parsed_ast = ast.parse(source) + + # Convert the AST into a string representation (dump it) + ast_bytes = ast.dump(parsed_ast).encode("utf-8") + + # Combine the AST bytes with the salt (encoded into bytes) + combined_data = ast_bytes + self.salt.encode("utf-8") + + # Return the SHA-256 hash of the combined data (AST + salt) + return hashlib.sha256(combined_data).hexdigest() diff --git a/plugins/flytekit-auto-cache/tests/test_auto_cache.py b/plugins/flytekit-auto-cache/tests/test_auto_cache.py index e69de29bb2..bd65b5461a 100644 --- a/plugins/flytekit-auto-cache/tests/test_auto_cache.py +++ b/plugins/flytekit-auto-cache/tests/test_auto_cache.py @@ -0,0 +1,80 @@ +from flytekitplugins.auto_cache import CacheFunctionBody + + +# Dummy functions +def dummy_function(x: int, y: int) -> int: + result = x + y + return result + +def dummy_function_modified(x: int, y: int) -> int: + result = x * y + return result + + +def dummy_function_with_comments_and_formatting(x: int, y: int) -> int: + # Adding a new line here + result = ( + x + y + ) + # Another new line + return result + + + +def test_get_version_with_same_function_and_salt(): + """ + Test that calling get_version with the same function and salt returns the same hash. + """ + cache1 = CacheFunctionBody(salt="salt") + cache2 = CacheFunctionBody(salt="salt") + + # Both calls should return the same hash since the function and salt are the same + version1 = cache1.get_version(dummy_function) + version2 = cache2.get_version(dummy_function) + + assert version1 == version2, f"Expected {version1}, but got {version2}" + + +def test_get_version_with_different_salt(): + """ + Test that calling get_version with different salts returns different hashes for the same function. + """ + cache1 = CacheFunctionBody(salt="salt1") + cache2 = CacheFunctionBody(salt="salt2") + + # The hashes should be different because the salts are different + version1 = cache1.get_version(dummy_function) + version2 = cache2.get_version(dummy_function) + + assert version1 != version2, f"Expected different hashes but got the same: {version1}" + + +def test_get_version_with_different_function_source(): + """ + Test that calling get_version with different function sources returns different hashes. + """ + cache = CacheFunctionBody(salt="salt") + + # The hash should be different because the function source has changed + version1 = cache.get_version(dummy_function) + version2 = cache.get_version(dummy_function_modified) + + assert version1 != version2, f"Expected different hashes but got the same: {version1} and {version2}" + + +def test_get_version_with_comments_and_formatting_changes(): + """ + Test that adding comments, changing formatting, or modifying the function signature + results in a different hash. + """ + # Modify the function by adding comments and changing the formatting + cache = CacheFunctionBody(salt="salt") + + # Get the hash for the original dummy function + original_version = cache.get_version(dummy_function) + + # Get the hash for the function with comments and formatting changes + version_with_comments_and_formatting = cache.get_version(dummy_function_with_comments_and_formatting) + + # Assert that the hashes are different + assert original_version != version_with_comments_and_formatting, f"Expected different hashes but got the same: {original_version} and {version_with_comments_and_formatting}" From 50552a96bab88d8052c05c5e76f8bc7c257a17dc Mon Sep 17 00:00:00 2001 From: Daniel Sola Date: Wed, 6 Nov 2024 16:24:17 -0800 Subject: [PATCH 3/3] fix tests to import dummy functions Signed-off-by: Daniel Sola --- .../tests/dummy_functions/__init__.py | 0 .../tests/dummy_functions/dummy_function.py | 3 + ...mmy_function_comments_formatting_change.py | 9 +++ .../dummy_function_logic_change.py | 3 + .../tests/test_auto_cache.py | 77 ++++++++++--------- 5 files changed, 56 insertions(+), 36 deletions(-) create mode 100644 plugins/flytekit-auto-cache/tests/dummy_functions/__init__.py create mode 100644 plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function.py create mode 100644 plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_comments_formatting_change.py create mode 100644 plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_logic_change.py diff --git a/plugins/flytekit-auto-cache/tests/dummy_functions/__init__.py b/plugins/flytekit-auto-cache/tests/dummy_functions/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function.py b/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function.py new file mode 100644 index 0000000000..413311f465 --- /dev/null +++ b/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function.py @@ -0,0 +1,3 @@ +def dummy_function(x: int, y: int) -> int: + result = x + y + return result diff --git a/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_comments_formatting_change.py b/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_comments_formatting_change.py new file mode 100644 index 0000000000..68b95eb025 --- /dev/null +++ b/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_comments_formatting_change.py @@ -0,0 +1,9 @@ +def dummy_function(x: int, y: int) -> int: + # Adding some comments + result = ( + x + # Adding inline comment + y # Another inline comment + ) + + # More comments + return result diff --git a/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_logic_change.py b/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_logic_change.py new file mode 100644 index 0000000000..0a63e11adf --- /dev/null +++ b/plugins/flytekit-auto-cache/tests/dummy_functions/dummy_function_logic_change.py @@ -0,0 +1,3 @@ +def dummy_function(x: int, y: int) -> int: + result = x * y + return result diff --git a/plugins/flytekit-auto-cache/tests/test_auto_cache.py b/plugins/flytekit-auto-cache/tests/test_auto_cache.py index bd65b5461a..efebe8ad0a 100644 --- a/plugins/flytekit-auto-cache/tests/test_auto_cache.py +++ b/plugins/flytekit-auto-cache/tests/test_auto_cache.py @@ -1,26 +1,9 @@ +from dummy_functions.dummy_function import dummy_function +from dummy_functions.dummy_function_comments_formatting_change import dummy_function as dummy_function_comments_formatting_change +from dummy_functions.dummy_function_logic_change import dummy_function as dummy_function_logic_change from flytekitplugins.auto_cache import CacheFunctionBody -# Dummy functions -def dummy_function(x: int, y: int) -> int: - result = x + y - return result - -def dummy_function_modified(x: int, y: int) -> int: - result = x * y - return result - - -def dummy_function_with_comments_and_formatting(x: int, y: int) -> int: - # Adding a new line here - result = ( - x + y - ) - # Another new line - return result - - - def test_get_version_with_same_function_and_salt(): """ Test that calling get_version with the same function and salt returns the same hash. @@ -49,32 +32,54 @@ def test_get_version_with_different_salt(): assert version1 != version2, f"Expected different hashes but got the same: {version1}" -def test_get_version_with_different_function_source(): + +def test_get_version_with_different_logic(): """ - Test that calling get_version with different function sources returns different hashes. + Test that functions with the same name but different logic produce different hashes. """ cache = CacheFunctionBody(salt="salt") - - # The hash should be different because the function source has changed version1 = cache.get_version(dummy_function) - version2 = cache.get_version(dummy_function_modified) + version2 = cache.get_version(dummy_function_logic_change) - assert version1 != version2, f"Expected different hashes but got the same: {version1} and {version2}" + assert version1 != version2, ( + f"Hashes should be different for functions with same name but different logic. " + f"Got {version1} and {version2}" + ) +# Test functions with different names but same logic +def function_one(x: int, y: int) -> int: + result = x + y + return result -def test_get_version_with_comments_and_formatting_changes(): +def function_two(x: int, y: int) -> int: + result = x + y + return result + +def test_get_version_with_different_function_names(): """ - Test that adding comments, changing formatting, or modifying the function signature - results in a different hash. + Test that functions with different names but same logic produce different hashes. """ - # Modify the function by adding comments and changing the formatting cache = CacheFunctionBody(salt="salt") - # Get the hash for the original dummy function - original_version = cache.get_version(dummy_function) + version1 = cache.get_version(function_one) + version2 = cache.get_version(function_two) - # Get the hash for the function with comments and formatting changes - version_with_comments_and_formatting = cache.get_version(dummy_function_with_comments_and_formatting) + assert version1 != version2, ( + f"Hashes should be different for functions with different names. " + f"Got {version1} and {version2}" + ) - # Assert that the hashes are different - assert original_version != version_with_comments_and_formatting, f"Expected different hashes but got the same: {original_version} and {version_with_comments_and_formatting}" +def test_get_version_with_formatting_changes(): + """ + Test that changing formatting and comments but keeping the same function name + results in the same hash. + """ + + cache = CacheFunctionBody(salt="salt") + version1 = cache.get_version(dummy_function) + version2 = cache.get_version(dummy_function_comments_formatting_change) + + assert version1 == version2, ( + f"Hashes should be the same for functions with same name but different formatting. " + f"Got {version1} and {version2}" + )