diff --git a/CHANGES.txt b/CHANGES.txt
index 88eab272..b7776dff 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -2,3 +2,8 @@ v0.1.0, 2020-04-23 -- Initial release.
v0.2.0, 2020-05-05 -- Change interface of TrackingDataSerializer
Add Metrica Tracking Serializer including automated tests
Cleanup some import statements
+v0.2.1, 2020-05-12 -- Add some helper functions to directly load a dataset by filenames
+v0.3.0, 2020-05-15 -- Add FIFA EPTS Tracking data loader
+ Add some examples
+ Add datasets loader to directly load dataset from your python code
+ Add limit argument to all loaders
diff --git a/README.md b/README.md
index 000539e0..a63e58b5 100644
--- a/README.md
+++ b/README.md
@@ -13,11 +13,13 @@ different tracking- and event data like a breeze. It aims to be the fundamental
## Main Features
Here are just a few of the things that kloppy does well:
+- Directly load [**Public datasets**](#datasets) to get started right away.
- Understandable [**Standardized models**](#models) for tracking- and event datasets
- Out-of-the-box [**(De)serializing**](#serializing) tracking- and event data from different sources into standardized models and vice versa
- Flexible [**pitch dimensions**](#pitch-dimensions) transformer for changing a dataset pitch dimensions from one to another (eg OPTA's 100x100 -> TRACAB meters)
- Intelligent [**orientation**](#orientation) transforming orientation of a dataset (eg from TRACAB fixed orientation to "Home Team" orientation)
+
## Where to get it
The source code is currently hosted on GitHub at:
https://github.com/PySport/kloppy
@@ -50,7 +52,15 @@ data_set = load_epts_tracking_data('meta.xml', 'raw_data.txt')
data_set = transform(data_set, pitch_dimensions=[[0, 108], [-34, 34]])
pandas_data_frame = to_pandas(data_set)
+```
+
+### Public datasets / Very quick start
+More and more companies are publishing (demo) datasets to get you started. Inspired by the `tensorflow_datasets` package,
+we added a "dataset loader" which does all the heavy lifting for you: find urls, download files, organize and load them.
+```python
+from kloppy import datasets
+data_set = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10})
```
### Standardized models
diff --git a/examples/datasets/metrica.py b/examples/datasets/metrica.py
new file mode 100644
index 00000000..d0322a6b
--- /dev/null
+++ b/examples/datasets/metrica.py
@@ -0,0 +1,22 @@
+from kloppy import datasets, to_pandas
+
+
+def main():
+ """
+    This example shows the use of Metrica datasets, and how we can pass arguments
+ to the dataset loader.
+ """
+
+ # The metrica dataset loader loads by default the 'game1' dataset
+ data_set = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10})
+ print(len(data_set.frames))
+
+ # We can pass additional keyword arguments to the loaders to specify a different dataset
+ data_set = datasets.load("metrica_tracking", options={'limit': 1000}, game='game2')
+
+ data_frame = to_pandas(data_set)
+ print(data_frame)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/kloppy/__init__.py b/kloppy/__init__.py
index de58587a..f7ea039b 100644
--- a/kloppy/__init__.py
+++ b/kloppy/__init__.py
@@ -1,2 +1,3 @@
from .infra.serializers import *
from .helpers import *
+from .infra import datasets
diff --git a/kloppy/helpers.py b/kloppy/helpers.py
index 6007936d..a48176f6 100644
--- a/kloppy/helpers.py
+++ b/kloppy/helpers.py
@@ -103,3 +103,12 @@ def to_pandas(data_set: DataSet, _record_converter: Callable = None) -> 'DataFra
return pd.DataFrame.from_records(
map(_record_converter, data_set.records)
)
+
+
+__all__ = [
+ 'load_tracab_tracking_data',
+ 'load_metrica_tracking_data',
+ 'load_epts_tracking_data',
+ 'to_pandas',
+ 'transform'
+]
diff --git a/kloppy/infra/datasets/__init__.py b/kloppy/infra/datasets/__init__.py
new file mode 100644
index 00000000..370a5b63
--- /dev/null
+++ b/kloppy/infra/datasets/__init__.py
@@ -0,0 +1,8 @@
+# import for registration
+from . import tracking
+
+from .core.loading import load
+
+__all__ = [
+ 'load'
+]
diff --git a/kloppy/infra/datasets/core/__init__.py b/kloppy/infra/datasets/core/__init__.py
new file mode 100644
index 00000000..c9b1f513
--- /dev/null
+++ b/kloppy/infra/datasets/core/__init__.py
@@ -0,0 +1 @@
+from .builder import DatasetBuilder
diff --git a/kloppy/infra/datasets/core/builder.py b/kloppy/infra/datasets/core/builder.py
new file mode 100644
index 00000000..84e074ab
--- /dev/null
+++ b/kloppy/infra/datasets/core/builder.py
@@ -0,0 +1,15 @@
+from abc import abstractmethod
+from typing import Dict, Type, Union
+
+from ...serializers.tracking import TrackingDataSerializer
+from .registered import RegisteredDataset
+
+
+class DatasetBuilder(metaclass=RegisteredDataset):
+ @abstractmethod
+ def get_data_set_files(self, **kwargs) -> Dict[str, Dict[str, str]]:
+ raise NotImplementedError
+
+ @abstractmethod
+ def get_serializer_cls(self) -> Union[Type[TrackingDataSerializer]]:
+ raise NotImplementedError
diff --git a/kloppy/infra/datasets/core/loading.py b/kloppy/infra/datasets/core/loading.py
new file mode 100644
index 00000000..c0005145
--- /dev/null
+++ b/kloppy/infra/datasets/core/loading.py
@@ -0,0 +1,67 @@
+import os
+
+import requests
+
+from typing import Dict, Union
+
+from kloppy.domain import DataSet, TrackingDataSet
+
+from .registered import _DATASET_REGISTRY
+
+
+def download_file(url, local_filename):
+ with requests.get(url, stream=True) as r:
+ r.raise_for_status()
+ with open(local_filename, 'wb') as f:
+ for chunk in r.iter_content(chunk_size=8192):
+ f.write(chunk)
+
+
+def get_local_files(data_set_name: str, files: Dict[str, str]) -> Dict[str, str]:
+ datasets_base_dir = os.environ.get('KLOPPY_BASE_DIR', None)
+ if not datasets_base_dir:
+ datasets_base_dir = os.path.expanduser('~/kloppy_datasets')
+
+ dataset_base_dir = f'{datasets_base_dir}/{data_set_name}'
+ if not os.path.exists(dataset_base_dir):
+ os.makedirs(dataset_base_dir)
+
+ local_files = {}
+ for file_key, file_url in files.items():
+ filename = file_url.split('/')[-1]
+        local_filename = f'{dataset_base_dir}/{filename}'
+ if not os.path.exists(local_filename):
+            print(f'Downloading {filename}...')
+ download_file(file_url, local_filename)
+ print('Done')
+ local_files[file_key] = local_filename
+ return local_files
+
+
+def load(data_set_name: str, options=None, **dataset_kwargs) -> Union[TrackingDataSet]:
+ if data_set_name not in _DATASET_REGISTRY:
+ raise ValueError(f"Dataset {data_set_name} not found")
+
+ builder_cls = _DATASET_REGISTRY[data_set_name]
+ builder = builder_cls()
+
+ dataset_remote_files = builder.get_data_set_files(**dataset_kwargs)
+ dataset_local_files = get_local_files(data_set_name, dataset_remote_files)
+
+ file_handlers = {
+ local_file_key: open(local_file_name, 'rb')
+ for local_file_key, local_file_name
+ in dataset_local_files.items()
+ }
+
+ try:
+ serializer_cls = builder.get_serializer_cls()
+ serializer = serializer_cls()
+ data_set = serializer.deserialize(
+ inputs=file_handlers,
+ options=options
+ )
+ finally:
+ for fp in file_handlers.values():
+ fp.close()
+ return data_set
diff --git a/kloppy/infra/datasets/core/registered.py b/kloppy/infra/datasets/core/registered.py
new file mode 100644
index 00000000..35940332
--- /dev/null
+++ b/kloppy/infra/datasets/core/registered.py
@@ -0,0 +1,27 @@
+import inspect
+import re
+import abc
+from typing import Type, Dict
+
+
+_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
+_all_cap_re = re.compile("([a-z0-9])([A-Z])")
+
+# from .builder import DatasetBuilder
+_DATASET_REGISTRY: Dict[str, Type['DatasetBuilder']] = {}
+
+
+def camelcase_to_snakecase(name):
+ """Convert camel-case string to snake-case."""
+ s1 = _first_cap_re.sub(r"\1_\2", name)
+ return _all_cap_re.sub(r"\1_\2", s1).lower()
+
+
+class RegisteredDataset(abc.ABCMeta):
+ def __new__(mcs, cls_name, bases, class_dict):
+ name = camelcase_to_snakecase(cls_name)
+ class_dict["name"] = name
+ builder_cls = super(RegisteredDataset, mcs).__new__(mcs, cls_name, bases, class_dict)
+ if not inspect.isabstract(builder_cls):
+ _DATASET_REGISTRY[name] = builder_cls
+ return builder_cls
diff --git a/kloppy/infra/datasets/tracking/__init__.py b/kloppy/infra/datasets/tracking/__init__.py
new file mode 100644
index 00000000..659d3352
--- /dev/null
+++ b/kloppy/infra/datasets/tracking/__init__.py
@@ -0,0 +1 @@
+from .metrica import MetricaTracking
diff --git a/kloppy/infra/datasets/tracking/metrica.py b/kloppy/infra/datasets/tracking/metrica.py
new file mode 100644
index 00000000..a296bb2c
--- /dev/null
+++ b/kloppy/infra/datasets/tracking/metrica.py
@@ -0,0 +1,25 @@
+from typing import Dict, Type
+
+from ..core.builder import DatasetBuilder
+from ...serializers.tracking import TrackingDataSerializer, MetricaTrackingSerializer
+
+
+_DATASET_URLS = {
+ 'game1': {
+ 'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv',
+ 'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv'
+ },
+ 'game2': {
+ 'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Home_Team.csv',
+ 'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Away_Team.csv'
+ }
+}
+
+
+class MetricaTracking(DatasetBuilder):
+ def get_data_set_files(self,**kwargs) -> Dict[str, str]:
+ game = kwargs.get('game', 'game1')
+ return _DATASET_URLS[game]
+
+ def get_serializer_cls(self) -> Type[TrackingDataSerializer]:
+ return MetricaTrackingSerializer
diff --git a/kloppy/infra/serializers/tracking/epts/reader.py b/kloppy/infra/serializers/tracking/epts/reader.py
index 2ff02ac3..5006c8bb 100644
--- a/kloppy/infra/serializers/tracking/epts/reader.py
+++ b/kloppy/infra/serializers/tracking/epts/reader.py
@@ -87,7 +87,7 @@ def _set_current_data_spec(idx):
yield row
n += 1
- if limit and n > limit:
+ if limit and n >= limit:
break
if frame_id >= end_frame_id:
diff --git a/kloppy/infra/serializers/tracking/metrica.py b/kloppy/infra/serializers/tracking/metrica.py
index 63e31a99..533a8dcd 100644
--- a/kloppy/infra/serializers/tracking/metrica.py
+++ b/kloppy/infra/serializers/tracking/metrica.py
@@ -189,7 +189,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac
)
n += 1
- if limit and n > limit:
+ if limit and n >= limit:
break
orientation = (
diff --git a/kloppy/infra/serializers/tracking/tracab.py b/kloppy/infra/serializers/tracking/tracab.py
index acda4cbf..b33872bd 100644
--- a/kloppy/infra/serializers/tracking/tracab.py
+++ b/kloppy/infra/serializers/tracking/tracab.py
@@ -179,8 +179,7 @@ def _iter():
attacking_direction=attacking_direction_from_frame(frame)
)
- n += 1
- if limit and n > limit:
+ if limit and n >= limit:
break
orientation = (
diff --git a/kloppy/tests/test_helpers.py b/kloppy/tests/test_helpers.py
index bbb6b141..42c18666 100644
--- a/kloppy/tests/test_helpers.py
+++ b/kloppy/tests/test_helpers.py
@@ -1,13 +1,14 @@
import os
-from io import BytesIO
from pandas import DataFrame
from pandas.testing import assert_frame_equal
-from kloppy import MetricaTrackingSerializer, to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, \
- TrackingDataSet, PitchDimensions, Dimension, Orientation, Frame, transform
-from kloppy.domain import Period, DataSetFlag, Point, AttackingDirection
-from kloppy.infra.utils import performance_logging
+from kloppy import to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, transform
+from kloppy.domain import (
+ Period, DataSetFlag, Point, AttackingDirection,
+ TrackingDataSet, PitchDimensions, Dimension,
+ Orientation, Frame
+)
class TestHelpers:
diff --git a/setup.py b/setup.py
index d29f91f6..4d495c58 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
setup(
name='kloppy',
- version='0.2.1',
+ version='0.3.0',
author='Koen Vossen',
author_email='info@koenvossen.nl',
url="https://github.com/PySport/kloppy",
@@ -26,7 +26,8 @@
"Topic :: Scientific/Engineering"
],
install_requires=[
- 'lxml>=4.5.0'
+ 'lxml>=4.5.0',
+ 'requests>=2.0.0'
],
extras_require={
'test': [