Performance tests for DataCatalog #4230

Merged: 21 commits from performance-datacatalog into main on Oct 18, 2024
Changes from all 21 commits:
a92b911
Update index.md (#4221)
Vaslo Oct 10, 2024
e863f16
Bump kedro-sphinx-theme from 2024.4.0 to 2024.10.0 (#4216)
dependabot[bot] Oct 10, 2024
2ccba38
Replace all instances of "data set" with "dataset" (#4211)
deepyaman Oct 10, 2024
e071640
Manually created sitemap.xml for improved control over indexed docs p…
DimedS Oct 10, 2024
8cb24c4
Bump up version to 0.19.9 (#4219)
lrcouto Oct 10, 2024
3b2878e
first pass doesn't work yet
ankatiyar Oct 11, 2024
f2f1770
Update ocl tests
ankatiyar Oct 14, 2024
7618ac5
revert some changes
ankatiyar Oct 14, 2024
95628a3
Update to use larger config
ankatiyar Oct 14, 2024
b5b9bf5
Update functions and docstrings
ankatiyar Oct 15, 2024
9737847
Add performance tests for DataCatalog
ankatiyar Oct 15, 2024
3d1cad4
Update mypy ignore messages (#4228)
ankatiyar Oct 15, 2024
9fc6711
Revise Kedro project structure docs (#4208)
DimedS Oct 15, 2024
b6587e0
Update CLI autocompletion docs with new Click syntax (#4213)
hyew0nChoi Oct 15, 2024
062aba3
Bump import-linter from 2.0 to 2.1 (#4226)
dependabot[bot] Oct 15, 2024
56aefae
Performance test for `OmegaConfigLoader` (#4225)
ankatiyar Oct 16, 2024
f981b9b
Add a test for init and fix indent
ankatiyar Oct 16, 2024
b8e4203
Revert "Add a test for init and fix indent"
ankatiyar Oct 16, 2024
821401d
Add a test for init and fix indent
ankatiyar Oct 16, 2024
f0356c4
Merge branch 'main' into performance-datacatalog
ankatiyar Oct 17, 2024
f3a5925
Merge branch 'main' into performance-datacatalog
ankatiyar Oct 18, 2024
8 changes: 7 additions & 1 deletion asv.conf.json
@@ -8,5 +8,11 @@
"environment_type": "virtualenv",
"show_commit_url": "http://github.com/kedro-org/kedro/commit/",
"results_dir": ".asv/results",
"html_dir": ".asv/html"
"html_dir": ".asv/html",
"matrix": {
"req": {
"kedro-datasets": [],
"pandas": []
}
}
}
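The new matrix block tells asv which extra packages to install into each benchmark virtualenv: kedro-datasets and pandas, with an empty list apparently meaning "no version pin", so whatever release pip resolves gets installed. Both are needed because the new DataCatalog benchmark imports them at module level. A quick, illustrative sanity check of such an environment (not part of the PR) might look like this:

# Illustrative only: the benchmark virtualenv that asv builds from the matrix
# above must provide both imports used by benchmarks/benchmark_datacatalog.py.
import pandas as pd
from kedro_datasets.pandas import CSVDataset

print(pd.__version__)  # whichever version asv resolved; the matrix sets no pin
print(CSVDataset)      # dataset class used by the new benchmark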
82 changes: 82 additions & 0 deletions benchmarks/benchmark_datacatalog.py
@@ -0,0 +1,82 @@
import pandas as pd
from kedro_datasets.pandas import CSVDataset

from kedro.io import DataCatalog

base_catalog = {
    f"dataset_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": f"data_{i}.csv",
    } for i in range(1, 1001)
}
# Add datasets with the same filepath for loading
base_catalog.update({
    f"dataset_load_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": "data.csv",
    } for i in range(1, 1001)
})
# Add a factory pattern
base_catalog.update({
    "dataset_factory_{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "data_{placeholder}.csv",
    }
})

class TimeDataCatalog:
    def setup(self):
        self.catalog = DataCatalog.from_config(base_catalog)
        self.dataframe = pd.DataFrame({"column": [1, 2, 3]})
        self.dataframe.to_csv("data.csv", index=False)
        self.datasets = {
            f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001)
        }
        self.feed_dict = {
            f"param_{i}": i for i in range(1, 1001)
        }

    def time_init(self):
        """Benchmark the time to initialize the catalog"""
        DataCatalog.from_config(base_catalog)

    def time_save(self):
        """Benchmark the time to save datasets"""
        for i in range(1, 1001):
            self.catalog.save(f"dataset_{i}", self.dataframe)

    def time_load(self):
        """Benchmark the time to load datasets"""
        for i in range(1, 1001):
            self.catalog.load(f"dataset_load_{i}")

    def time_exists(self):
        """Benchmark the time to check if datasets exist"""
        for i in range(1, 1001):
            self.catalog.exists(f"dataset_{i}")

    def time_release(self):
        """Benchmark the time to release datasets"""
        for i in range(1, 1001):
            self.catalog.release(f"dataset_{i}")

    def time_add_all(self):
        """Benchmark the time to add all datasets"""
        self.catalog.add_all(self.datasets)

    def time_feed_dict(self):
        """Benchmark the time to add feed dict"""
        self.catalog.add_feed_dict(self.feed_dict)

    def time_list(self):
        """Benchmark the time to list all datasets"""
        self.catalog.list()

    def time_shallow_copy(self):
        """Benchmark the time to shallow copy the catalog"""
        self.catalog.shallow_copy()

    def time_resolve_factory(self):
        """Benchmark the time to resolve factory"""
        for i in range(1, 1001):
            self.catalog._get_dataset(f"dataset_factory_{i}")
12 changes: 6 additions & 6 deletions benchmarks/benchmark_ocl.py
@@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False):
return globals_dict

def _create_config_file(conf_source, env, file_name, data):
env_path = conf_source / env
env_path.mkdir(parents=True, exist_ok=True)
file_path = env_path / file_name
env_path = conf_source / env
env_path.mkdir(parents=True, exist_ok=True)
file_path = env_path / file_name

import yaml
with open(file_path, "w") as f:
yaml.dump(data, f)
import yaml
with open(file_path, "w") as f:
yaml.dump(data, f)

base_catalog = _generate_catalog(1, 1000, is_versioned=True)
local_catalog = _generate_catalog(501, 1500, is_local=True)
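For context, the change to _create_config_file above appears to be whitespace-only (the removed and added lines carry the same text), which is why the block shows twice. The helper simply writes a dict as YAML under <conf_source>/<env>/<file_name>. A self-contained, hypothetical call, with the path and catalog entry invented for illustration:

from pathlib import Path

import yaml

def _create_config_file(conf_source, env, file_name, data):
    # Mirrors the helper in benchmarks/benchmark_ocl.py: ensure the env folder
    # exists, then dump the mapping as YAML.
    env_path = conf_source / env
    env_path.mkdir(parents=True, exist_ok=True)
    file_path = env_path / file_name
    with open(file_path, "w") as f:
        yaml.dump(data, f)

_create_config_file(
    Path("conf"), "base", "catalog.yml",
    {"cars": {"type": "pandas.CSVDataset", "filepath": "cars.csv"}},
)
# Writes conf/base/catalog.yml containing the YAML-dumped mapping.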