diff --git a/CHANGES.txt b/CHANGES.txt index 88eab272..b7776dff 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,3 +2,8 @@ v0.1.0, 2020-04-23 -- Initial release. v0.2.0, 2020-05-05 -- Change interface of TrackingDataSerializer Add Metrica Tracking Serializer including automated tests Cleanup some import statements +v0.2.1, 2020-05-12 -- Add some helpers functions to directly load a dataset by filenames +v0.3.0, 2020-05-15 -- Add FIFA EPTS Tracking data loader + Add some examples + Add datasets loader to directly load dataset from your python code + Add limit argument to all loaders diff --git a/README.md b/README.md index 000539e0..a63e58b5 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,13 @@ different tracking- and event data like a breeze. It aims to be the fundamental ## Main Features Here are just a few of the things that kloppy does well: +- Directly load [**Public datasets**](#datasets) to get started right away. - Understandable [**Standardized models**](#models) for tracking- and event datasets - Out-of-the-box [**(De)serializing**](#serializing) tracking- and event data from different source into standardized models and visa-versa - Flexible [**pitch dimensions**](#pitch-dimensions) transformer for changing a dataset pitch dimensions from one to another (eg OPTA's 100x100 -> TRACAB meters) - Intelligent [**orientation**](#orientation) transforming orientation of a dataset (eg from TRACAB fixed orientation to "Home Team" orientation) + ## Where to get it The source code is currently hosted on GitHub at: https://github.com/PySport/kloppy @@ -50,7 +52,15 @@ data_set = load_epts_tracking_data('meta.xml', 'raw_data.txt') data_set = transform(data_set, pitch_dimensions=[[0, 108], [-34, 34]]) pandas_data_frame = to_pandas(data_set) +``` + +### Public datasets / Very quick start +More and more companies are publishing (demo) datasets to get you started. 
Inspired by the `tensorflow_datasets` package, +we added a "dataset loader" which does all the heavy lifting for you: find urls, download files, organize and load them. +```python +from kloppy import datasets +data_set = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10}) ``` ### Standardized models diff --git a/examples/datasets/metrica.py b/examples/datasets/metrica.py new file mode 100644 index 00000000..d0322a6b --- /dev/null +++ b/examples/datasets/metrica.py @@ -0,0 +1,22 @@ +from kloppy import datasets, to_pandas + + +def main(): + """ + This example shows the use of Metrica datasets, and how we can pass argument + to the dataset loader. + """ + + # The metrica dataset loader loads by default the 'game1' dataset + data_set = datasets.load("metrica_tracking", options={'sample_rate': 1./12, 'limit': 10}) + print(len(data_set.frames)) + + # We can pass additional keyword arguments to the loaders to specify a different dataset + data_set = datasets.load("metrica_tracking", options={'limit': 1000}, game='game2') + + data_frame = to_pandas(data_set) + print(data_frame) + + +if __name__ == "__main__": + main() diff --git a/kloppy/__init__.py b/kloppy/__init__.py index de58587a..f7ea039b 100644 --- a/kloppy/__init__.py +++ b/kloppy/__init__.py @@ -1,2 +1,3 @@ from .infra.serializers import * from .helpers import * +from .infra import datasets diff --git a/kloppy/helpers.py b/kloppy/helpers.py index 6007936d..a48176f6 100644 --- a/kloppy/helpers.py +++ b/kloppy/helpers.py @@ -103,3 +103,12 @@ def to_pandas(data_set: DataSet, _record_converter: Callable = None) -> 'DataFra return pd.DataFrame.from_records( map(_record_converter, data_set.records) ) + + +__all__ = [ + 'load_tracab_tracking_data', + 'load_metrica_tracking_data', + 'load_epts_tracking_data', + 'to_pandas', + 'transform' +] diff --git a/kloppy/infra/datasets/__init__.py b/kloppy/infra/datasets/__init__.py new file mode 100644 index 00000000..370a5b63 --- /dev/null +++ 
b/kloppy/infra/datasets/__init__.py @@ -0,0 +1,8 @@ +# import for registration +from . import tracking + +from .core.loading import load + +__all__ = [ + 'load' +] diff --git a/kloppy/infra/datasets/core/__init__.py b/kloppy/infra/datasets/core/__init__.py new file mode 100644 index 00000000..c9b1f513 --- /dev/null +++ b/kloppy/infra/datasets/core/__init__.py @@ -0,0 +1 @@ +from .builder import DatasetBuilder diff --git a/kloppy/infra/datasets/core/builder.py b/kloppy/infra/datasets/core/builder.py new file mode 100644 index 00000000..84e074ab --- /dev/null +++ b/kloppy/infra/datasets/core/builder.py @@ -0,0 +1,15 @@ +from abc import abstractmethod +from typing import Dict, Type, Union + +from ...serializers.tracking import TrackingDataSerializer +from .registered import RegisteredDataset + + +class DatasetBuilder(metaclass=RegisteredDataset): + @abstractmethod + def get_data_set_files(self, **kwargs) -> Dict[str, Dict[str, str]]: + raise NotImplementedError + + @abstractmethod + def get_serializer_cls(self) -> Union[Type[TrackingDataSerializer]]: + raise NotImplementedError diff --git a/kloppy/infra/datasets/core/loading.py b/kloppy/infra/datasets/core/loading.py new file mode 100644 index 00000000..c0005145 --- /dev/null +++ b/kloppy/infra/datasets/core/loading.py @@ -0,0 +1,67 @@ +import os + +import requests + +from typing import Dict, Union + +from kloppy.domain import DataSet, TrackingDataSet + +from .registered import _DATASET_REGISTRY + + +def download_file(url, local_filename): + with requests.get(url, stream=True) as r: + r.raise_for_status() + with open(local_filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + + +def get_local_files(data_set_name: str, files: Dict[str, str]) -> Dict[str, str]: + datasets_base_dir = os.environ.get('KLOPPY_BASE_DIR', None) + if not datasets_base_dir: + datasets_base_dir = os.path.expanduser('~/kloppy_datasets') + + dataset_base_dir = f'{datasets_base_dir}/{data_set_name}' + if not 
os.path.exists(dataset_base_dir): + os.makedirs(dataset_base_dir) + + local_files = {} + for file_key, file_url in files.items(): + filename = file_url.split('/')[-1] + local_filename = f'{dataset_base_dir}/{filename}' + if not os.path.exists(local_filename): + print(f'Downloading {filename}...') + download_file(file_url, local_filename) + print('Done') + local_files[file_key] = local_filename + return local_files + + +def load(data_set_name: str, options=None, **dataset_kwargs) -> Union[TrackingDataSet]: + if data_set_name not in _DATASET_REGISTRY: + raise ValueError(f"Dataset {data_set_name} not found") + + builder_cls = _DATASET_REGISTRY[data_set_name] + builder = builder_cls() + + dataset_remote_files = builder.get_data_set_files(**dataset_kwargs) + dataset_local_files = get_local_files(data_set_name, dataset_remote_files) + + file_handlers = { + local_file_key: open(local_file_name, 'rb') + for local_file_key, local_file_name + in dataset_local_files.items() + } + + try: + serializer_cls = builder.get_serializer_cls() + serializer = serializer_cls() + data_set = serializer.deserialize( + inputs=file_handlers, + options=options + ) + finally: + for fp in file_handlers.values(): + fp.close() + return data_set diff --git a/kloppy/infra/datasets/core/registered.py b/kloppy/infra/datasets/core/registered.py new file mode 100644 index 00000000..35940332 --- /dev/null +++ b/kloppy/infra/datasets/core/registered.py @@ -0,0 +1,27 @@ +import inspect +import re +import abc +from typing import Type, Dict + + +_first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)") +_all_cap_re = re.compile("([a-z0-9])([A-Z])") + +# from .builder import DatasetBuilder +_DATASET_REGISTRY: Dict[str, Type['DatasetBuilder']] = {} + + +def camelcase_to_snakecase(name): + """Convert camel-case string to snake-case.""" + s1 = _first_cap_re.sub(r"\1_\2", name) + return _all_cap_re.sub(r"\1_\2", s1).lower() + + +class RegisteredDataset(abc.ABCMeta): + def __new__(mcs, cls_name, bases, class_dict): + name
= camelcase_to_snakecase(cls_name) + class_dict["name"] = name + builder_cls = super(RegisteredDataset, mcs).__new__(mcs, cls_name, bases, class_dict) + if not inspect.isabstract(builder_cls): + _DATASET_REGISTRY[name] = builder_cls + return builder_cls diff --git a/kloppy/infra/datasets/tracking/__init__.py b/kloppy/infra/datasets/tracking/__init__.py new file mode 100644 index 00000000..659d3352 --- /dev/null +++ b/kloppy/infra/datasets/tracking/__init__.py @@ -0,0 +1 @@ +from .metrica import MetricaTracking diff --git a/kloppy/infra/datasets/tracking/metrica.py b/kloppy/infra/datasets/tracking/metrica.py new file mode 100644 index 00000000..a296bb2c --- /dev/null +++ b/kloppy/infra/datasets/tracking/metrica.py @@ -0,0 +1,25 @@ +from typing import Dict, Type + +from ..core.builder import DatasetBuilder +from ...serializers.tracking import TrackingDataSerializer, MetricaTrackingSerializer + + +_DATASET_URLS = { + 'game1': { + 'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv', + 'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv' + }, + 'game2': { + 'raw_data_home': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Home_Team.csv', + 'raw_data_away': 'https://raw.githubusercontent.com/metrica-sports/sample-data/master/data/Sample_Game_2/Sample_Game_2_RawTrackingData_Away_Team.csv' + } +} + + +class MetricaTracking(DatasetBuilder): + def get_data_set_files(self,**kwargs) -> Dict[str, str]: + game = kwargs.get('game', 'game1') + return _DATASET_URLS[game] + + def get_serializer_cls(self) -> Type[TrackingDataSerializer]: + return MetricaTrackingSerializer diff --git a/kloppy/infra/serializers/tracking/epts/reader.py b/kloppy/infra/serializers/tracking/epts/reader.py index 2ff02ac3..5006c8bb 100644 --- 
a/kloppy/infra/serializers/tracking/epts/reader.py +++ b/kloppy/infra/serializers/tracking/epts/reader.py @@ -87,7 +87,7 @@ def _set_current_data_spec(idx): yield row n += 1 - if limit and n > limit: + if limit and n >= limit: break if frame_id >= end_frame_id: diff --git a/kloppy/infra/serializers/tracking/metrica.py b/kloppy/infra/serializers/tracking/metrica.py index 63e31a99..533a8dcd 100644 --- a/kloppy/infra/serializers/tracking/metrica.py +++ b/kloppy/infra/serializers/tracking/metrica.py @@ -189,7 +189,7 @@ def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> Trac ) n += 1 - if limit and n > limit: + if limit and n >= limit: break orientation = ( diff --git a/kloppy/infra/serializers/tracking/tracab.py b/kloppy/infra/serializers/tracking/tracab.py index acda4cbf..b33872bd 100644 --- a/kloppy/infra/serializers/tracking/tracab.py +++ b/kloppy/infra/serializers/tracking/tracab.py @@ -179,8 +179,7 @@ def _iter(): attacking_direction=attacking_direction_from_frame(frame) ) - n += 1 - if limit and n > limit: + if limit and n >= limit: break orientation = ( diff --git a/kloppy/tests/test_helpers.py b/kloppy/tests/test_helpers.py index bbb6b141..42c18666 100644 --- a/kloppy/tests/test_helpers.py +++ b/kloppy/tests/test_helpers.py @@ -1,13 +1,14 @@ import os -from io import BytesIO from pandas import DataFrame from pandas.testing import assert_frame_equal -from kloppy import MetricaTrackingSerializer, to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, \ - TrackingDataSet, PitchDimensions, Dimension, Orientation, Frame, transform -from kloppy.domain import Period, DataSetFlag, Point, AttackingDirection -from kloppy.infra.utils import performance_logging +from kloppy import to_pandas, load_metrica_tracking_data, load_tracab_tracking_data, transform +from kloppy.domain import ( + Period, DataSetFlag, Point, AttackingDirection, + TrackingDataSet, PitchDimensions, Dimension, + Orientation, Frame +) class TestHelpers: diff --git 
a/setup.py b/setup.py index d29f91f6..4d495c58 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name='kloppy', - version='0.2.1', + version='0.3.0', author='Koen Vossen', author_email='info@koenvossen.nl', url="https://github.com/PySport/kloppy", @@ -26,7 +26,8 @@ "Topic :: Scientific/Engineering" ], install_requires=[ - 'lxml>=4.5.0' + 'lxml>=4.5.0', + 'requests>=2.0.0' ], extras_require={ 'test': [