diff --git a/.vscode/settings.json b/.vscode/settings.json index 02e2ab2c..27529ce4 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -26,7 +26,7 @@ "black-formatter.args": [ "--line-length", "88", - "--preview", + "--preview" ], "isort.args": [ "--profile", diff --git a/python/mlcroissant/mlcroissant/_src/core/git.py b/python/mlcroissant/mlcroissant/_src/core/git.py index 1367baa3..cc9fa589 100644 --- a/python/mlcroissant/mlcroissant/_src/core/git.py +++ b/python/mlcroissant/mlcroissant/_src/core/git.py @@ -37,7 +37,9 @@ def download_git_lfs_file(file: Path): try: repo.execute(["git", "lfs", "pull", "--include", fullpath]) except deps.git.exc.GitCommandError as ex: - raise RuntimeError("Problem when launching `git lfs`. " - "Possible problems: Have you installed git lfs " - f"locally? Is '{fullpath}' a valid `git lfs` " - "repository?") from ex + raise RuntimeError( + "Problem when launching `git lfs`. " + "Possible problems: Have you installed git lfs " + f"locally? Is '{fullpath}' a valid `git lfs` " + "repository?" + ) from ex diff --git a/python/mlcroissant/mlcroissant/_src/core/git_test.py b/python/mlcroissant/mlcroissant/_src/core/git_test.py index c1b15b18..611b0a7f 100644 --- a/python/mlcroissant/mlcroissant/_src/core/git_test.py +++ b/python/mlcroissant/mlcroissant/_src/core/git_test.py @@ -11,19 +11,15 @@ from mlcroissant._src.core.optional import deps from mlcroissant._src.core.path import Path -_GIT_LFS_CONTENT = ( - lambda: """version https://git-lfs.github.com/spec/v1 +_GIT_LFS_CONTENT = lambda: """version https://git-lfs.github.com/spec/v1 oid sha256:5e2785fcd9098567a49d6e62e328923d955b307b6dcd0492f6234e96e670772a size 309207547 """ -) -_NON_GIT_LFS_CONTENT = ( - lambda: """name,age +_NON_GIT_LFS_CONTENT = lambda: """name,age a,1 b,2 c,3""" -) _NON_ASCII_CONTENT = lambda: (255).to_bytes(1, byteorder="big") @@ -54,6 +50,6 @@ def test_download_git_lfs_file(): with mock.patch.object(git, "Git", autospec=True) as git_mock: download_git_lfs_file(file) git_mock.assert_called_once_with("/tmp/full/") - git_mock.return_value.execute.assert_called_once_with( - ["git", "lfs", "pull", "--include", "path.json"] - ) + git_mock.return_value.execute.assert_called_once_with([ + "git", "lfs", "pull", "--include", "path.json" + ]) diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py index 9017387c..a922f3e2 100644 --- a/python/mlcroissant/mlcroissant/_src/datasets_test.py +++ b/python/mlcroissant/mlcroissant/_src/datasets_test.py @@ -113,10 +113,16 @@ def test_hermetic_loading(dataset_name, record_set_name, num_records): @pytest.mark.parametrize( ["dataset_name", "record_set_name", "num_records"], [ - ["flores-200/metadata.json", - "language_translations_train_data_with_metadata", 10], - ["flores-200/metadata.json", - "language_translations_test_data_with_metadata", 10], + [ + "flores-200/metadata.json", + "language_translations_train_data_with_metadata", + 10, + ], + [ + "flores-200/metadata.json", + "language_translations_test_data_with_metadata", + 10, + ], ["gpt-3/metadata.json", "default", 10], ["huggingface-c4/metadata.json", "en", 1], ["huggingface-mnist/metadata.json", "default", 10], diff --git a/python/mlcroissant/mlcroissant/_src/nodes.py b/python/mlcroissant/mlcroissant/_src/nodes.py index 12f0fd40..e567839d 100644 --- a/python/mlcroissant/mlcroissant/_src/nodes.py +++ b/python/mlcroissant/mlcroissant/_src/nodes.py @@ -1,4 +1,5 @@ """Defines the public interface to the `mlcroissant.nodes` package.""" + from mlcroissant._src.structure_graph.nodes.field import Field from mlcroissant._src.structure_graph.nodes.file_object import FileObject from mlcroissant._src.structure_graph.nodes.file_set import FileSet diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/graph.py b/python/mlcroissant/mlcroissant/_src/operation_graph/graph.py index f8c16618..69d347e3 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/graph.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/graph.py @@ -94,9 +94,9 @@ def _add_operations_for_file_object( >> Concatenate(operations=operations, node=successor) ] if node.encoding_format and not should_extract(node.encoding_format): - fields = tuple( - [field for field in node.successors if isinstance(field, Field)] - ) + fields = tuple([ + field for field in node.successors if isinstance(field, Field) + ]) operation >> Read( operations=operations, node=node, diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/concatenate.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/concatenate.py index 37bba6d2..cf4ea8ea 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/concatenate.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/concatenate.py @@ -21,10 +21,8 @@ def __call__(self, *args: list[Path]) -> pd.DataFrame: """See class' docstring.""" assert len(args) > 0, "No dataframe to merge." files = [file for files in args for file in files] - return pd.DataFrame( - { - FileProperty.filepath: [os.fspath(file.filepath) for file in files], - FileProperty.filename: [file.filename for file in files], - FileProperty.fullpath: [os.fspath(file.fullpath) for file in files], - } - ) + return pd.DataFrame({ + FileProperty.filepath: [os.fspath(file.filepath) for file in files], + FileProperty.filename: [file.filename for file in files], + FileProperty.fullpath: [os.fspath(file.fullpath) for file in files], + }) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index bfa5acbc..724aa91d 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -64,9 +64,9 @@ def _extract_lines(row: pd.Series) -> pd.Series: """Reads a file line-by-line and outputs a named pd.Series of the lines.""" path = epath.Path(row[FileProperty.filepath]) lines = path.open("rb").read().splitlines() - return pd.Series( - {**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))} - ) + return pd.Series({ + **row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines)) + }) def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame: diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py index c2c9e568..914fbd40 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py @@ -97,11 +97,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame: return parse_json_content(json_content, self.fields) else: # Raw files are returned as a one-line pd.DataFrame. - return pd.DataFrame( - { - FileProperty.content: [json_content], - } - ) + return pd.DataFrame({ + FileProperty.content: [json_content], + }) elif encoding_format == EncodingFormat.JSON_LINES: return pd.read_json(file, lines=True) elif encoding_format == EncodingFormat.PARQUET: @@ -119,11 +117,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame: filepath, header=None, names=[FileProperty.lines] ) else: - return pd.DataFrame( - { - FileProperty.content: [file.read()], - } - ) + return pd.DataFrame({ + FileProperty.content: [file.read()], + }) else: raise ValueError( f"Unsupported encoding format for file: {encoding_format}" diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py index a573bddd..9945ef0f 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py @@ -41,12 +41,10 @@ def from_jsonld(cls, issues: Issues, json_) -> ParentField | None: def to_json(self) -> Json: """Converts the `ParentField` to JSON.""" - return remove_empty_values( - { - "references": self.references.to_json() if self.references else None, - "source": self.source.to_json() if self.source else None, - } - ) + return remove_empty_values({ + "references": self.references.to_json() if self.references else None, + "source": self.source.to_json() if self.source else None, + }) @dataclasses.dataclass(eq=False, repr=False) @@ -128,20 +126,18 @@ def to_json(self) -> Json: self.rdf.shorten_value(data_type) for data_type in self.data_types ] parent_field = self.parent_field.to_json() if self.parent_field else None - return remove_empty_values( - { - "@type": "ml:Field", - "name": self.name, - "description": self.description, - "dataType": data_types[0] if len(data_types) == 1 else data_types, - "isEnumeration": self.is_enumeration, - "parentField": parent_field, - "repeated": self.repeated, - "references": self.references.to_json() if self.references else None, - "source": self.source.to_json() if self.source else None, - "subField": [sub_field.to_json() for sub_field in self.sub_fields], - } - ) + return remove_empty_values({ + "@type": "ml:Field", + "name": self.name, + "description": self.description, + "dataType": data_types[0] if len(data_types) == 1 else data_types, + "isEnumeration": self.is_enumeration, + "parentField": parent_field, + "repeated": self.repeated, + "references": self.references.to_json() if self.references else None, + "source": self.source.to_json() if self.source else None, + "subField": [sub_field.to_json() for sub_field in self.sub_fields], + }) @classmethod def from_jsonld( diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py index 1565bea2..255083e3 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py @@ -45,20 +45,18 @@ def to_json(self) -> Json: contained_in: str | list[str] = self.contained_in[0] else: contained_in = self.contained_in - return remove_empty_values( - { - "@type": "sc:FileObject", - "name": self.name, - "description": self.description, - "contentSize": self.content_size, - "contentUrl": self.content_url, - "containedIn": contained_in, - "encodingFormat": self.encoding_format, - "md5": self.md5, - "sha256": self.sha256, - "source": self.source.to_json() if self.source else None, - } - ) + return remove_empty_values({ + "@type": "sc:FileObject", + "name": self.name, + "description": self.description, + "contentSize": self.content_size, + "contentUrl": self.content_url, + "containedIn": contained_in, + "encodingFormat": self.encoding_format, + "md5": self.md5, + "sha256": self.sha256, + "source": self.source.to_json() if self.source else None, + }) @classmethod def from_jsonld( diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object_test.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object_test.py index edc58412..5bc919c5 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object_test.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object_test.py @@ -25,9 +25,9 @@ def test_checks_are_performed(): Node, "assert_has_exclusive_properties" ) as exclusive_mock: create_test_node(FileObject) - mandatory_mock.assert_has_calls( - [mock.call("encoding_format", "name"), mock.call("content_url")] - ) + mandatory_mock.assert_has_calls([ + mock.call("encoding_format", "name"), mock.call("content_url") + ]) exclusive_mock.assert_called_once_with(["md5", "sha256"]) validate_name_mock.assert_called_once() diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py index f40899af..46f8aa7c 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py @@ -37,16 +37,14 @@ def to_json(self) -> Json: contained_in: str | list[str] = self.contained_in[0] else: contained_in = self.contained_in - return remove_empty_values( - { - "@type": "sc:FileSet", - "name": self.name, - "description": self.description, - "containedIn": contained_in, - "encodingFormat": self.encoding_format, - "includes": self.includes, - } - ) + return remove_empty_values({ + "@type": "sc:FileSet", + "name": self.name, + "description": self.description, + "containedIn": contained_in, + "encodingFormat": self.encoding_format, + "includes": self.includes, + }) @classmethod def from_jsonld( diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py index 7c04f44d..02b0ce6a 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py @@ -64,19 +64,17 @@ def __post_init__(self): def to_json(self) -> Json: """Converts the `Metadata` to JSON.""" - return remove_empty_values( - { - "@context": self.rdf.context, - "@type": "sc:Dataset", - "name": self.name, - "description": self.description, - "citation": self.citation, - "license": self.license, - "url": self.url, - "distribution": [f.to_json() for f in self.distribution], - "recordSet": [record_set.to_json() for record_set in self.record_sets], - } - ) + return remove_empty_values({ + "@context": self.rdf.context, + "@type": "sc:Dataset", + "name": self.name, + "description": self.description, + "citation": self.citation, + "license": self.license, + "url": self.url, + "distribution": [f.to_json() for f in self.distribution], + "recordSet": [record_set.to_json() for record_set in self.record_sets], + }) @property def file_objects(self) -> list[FileObject]: diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py index 4b7e7937..09287a67 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py @@ -68,17 +68,15 @@ def __post_init__(self): def to_json(self) -> Json: """Converts the `RecordSet` to JSON.""" - return remove_empty_values( - { - "@type": "ml:RecordSet", - "name": self.name, - "description": self.description, - "isEnumeration": self.is_enumeration, - "key": self.key, - "field": [field.to_json() for field in self.fields], - "data": self.data, - } - ) + return remove_empty_values({ + "@type": "ml:RecordSet", + "name": self.name, + "description": self.description, + "isEnumeration": self.is_enumeration, + "key": self.key, + "field": [field.to_json() for field in self.fields], + "data": self.data, + }) @classmethod def from_jsonld( diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py index 0a5781fc..f7d060b4 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py @@ -70,13 +70,11 @@ class Extract: def to_json(self) -> Json: """Converts the `Extract` to JSON.""" - return remove_empty_values( - { - "column": self.column, - "fileProperty": self.file_property.name if self.file_property else None, - "jsonPath": self.json_path, - } - ) + return remove_empty_values({ + "column": self.column, + "fileProperty": self.file_property.name if self.file_property else None, + "jsonPath": self.json_path, + }) @dataclasses.dataclass(frozen=True) @@ -100,15 +98,13 @@ class Transform: def to_json(self) -> Json: """Converts the `Transform` to JSON.""" - return remove_empty_values( - { - "format": self.format, - "jsonPath": self.json_path, - "regex": self.regex, - "replace": self.replace, - "separator": self.separator, - } - ) + return remove_empty_values({ + "format": self.format, + "jsonPath": self.json_path, + "regex": self.regex, + "replace": self.replace, + "separator": self.separator, + }) @classmethod def from_jsonld(cls, issues: Issues, jsonld: Json | list[Json]) -> list[Transform]: @@ -196,13 +192,11 @@ def to_json(self) -> Json: transforms = [transform.to_json() for transform in self.transforms] if self.node_type is None: raise ValueError("node_type should be `distribution` or `field`. Got: None") - return remove_empty_values( - { - self.node_type: self.uid, - "extract": self.extract.to_json(), - "transform": transforms[0] if len(transforms) == 1 else transforms, - } - ) + return remove_empty_values({ + self.node_type: self.uid, + "extract": self.extract.to_json(), + "transform": transforms[0] if len(transforms) == 1 else transforms, + }) @classmethod def from_jsonld(cls, issues: Issues, jsonld: Any) -> Source: diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source_test.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source_test.py index e8b607dc..1538b2a8 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source_test.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source_test.py @@ -103,29 +103,25 @@ def test_source_parses_list(json_ld, expected_source): [ [ "this is not a list", - set( - [ - 'Transform "this is not a list" should be a dict with the keys' - " http://mlcommons.org/schema/format," - " http://mlcommons.org/schema/jsonPath," - " http://mlcommons.org/schema/regex," - " http://mlcommons.org/schema/replace," - " http://mlcommons.org/schema/separator" - ] - ), + set([ + 'Transform "this is not a list" should be a dict with the keys' + " http://mlcommons.org/schema/format," + " http://mlcommons.org/schema/jsonPath," + " http://mlcommons.org/schema/regex," + " http://mlcommons.org/schema/replace," + " http://mlcommons.org/schema/separator" + ]), ], [ [{"not": "the right keys"}], - set( - [ - "Transform \"{'not': 'the right keys'}\" should be a dict with at" - " least one key in http://mlcommons.org/schema/format," - " http://mlcommons.org/schema/jsonPath," - " http://mlcommons.org/schema/regex," - " http://mlcommons.org/schema/replace," - " http://mlcommons.org/schema/separator" - ] - ), + set([ + "Transform \"{'not': 'the right keys'}\" should be a dict with at" + " least one key in http://mlcommons.org/schema/format," + " http://mlcommons.org/schema/jsonPath," + " http://mlcommons.org/schema/regex," + " http://mlcommons.org/schema/replace," + " http://mlcommons.org/schema/separator" + ]), ], ], ) diff --git a/python/mlcroissant/mlcroissant/_src/tests/records_test.py b/python/mlcroissant/mlcroissant/_src/tests/records_test.py index 1051b1c7..2a3df72e 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/records_test.py +++ b/python/mlcroissant/mlcroissant/_src/tests/records_test.py @@ -6,26 +6,22 @@ def test_record_to_python(): - assert record_to_python( - { - "key1": 1, - "key2": { - "key3": pd.Timestamp("2017-01-01T12"), - "key4": {"key5": b"foo", "key6": float("nan")}, - }, - } - ) == { + assert record_to_python({ + "key1": 1, + "key2": { + "key3": pd.Timestamp("2017-01-01T12"), + "key4": {"key5": b"foo", "key6": float("nan")}, + }, + }) == { "key1": 1, "key2": {"key3": "2017-01-01 12:00:00", "key4": {"key5": "foo", "key6": None}}, } - assert record_to_python( - { - "image": ( - "" - ) - } - ) == { + assert record_to_python({ + "image": ( + "" + ) + }) == { "image": ( ">" diff --git a/python/mlcroissant/mlcroissant/scripts/cli.py b/python/mlcroissant/mlcroissant/scripts/cli.py index 7fc74614..afbe051d 100644 --- a/python/mlcroissant/mlcroissant/scripts/cli.py +++ b/python/mlcroissant/mlcroissant/scripts/cli.py @@ -14,13 +14,11 @@ class Commands: VALIDATE = "validate" -choices = set( - [ - Commands.FROM_HUGGINGFACE_TO_CROISSANT, - Commands.LOAD, - Commands.VALIDATE, - ] -) +choices = set([ + Commands.FROM_HUGGINGFACE_TO_CROISSANT, + Commands.LOAD, + Commands.VALIDATE, +]) def main():