Skip to content

Commit

Permalink
Format with black.
Browse files Browse the repository at this point in the history
  • Loading branch information
marcenacp committed Nov 10, 2023
1 parent cc1ca94 commit 26bee88
Show file tree
Hide file tree
Showing 19 changed files with 150 additions and 179 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"black-formatter.args": [
"--line-length",
"88",
"--preview",
"--preview"
],
"isort.args": [
"--profile",
Expand Down
10 changes: 6 additions & 4 deletions python/mlcroissant/mlcroissant/_src/core/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def download_git_lfs_file(file: Path):
try:
repo.execute(["git", "lfs", "pull", "--include", fullpath])
except deps.git.exc.GitCommandError as ex:
raise RuntimeError("Problem when launching `git lfs`. "
"Possible problems: Have you installed git lfs "
f"locally? Is '{fullpath}' a valid `git lfs` "
"repository?") from ex
raise RuntimeError(
"Problem when launching `git lfs`. "
"Possible problems: Have you installed git lfs "
f"locally? Is '{fullpath}' a valid `git lfs` "
"repository?"
) from ex
14 changes: 5 additions & 9 deletions python/mlcroissant/mlcroissant/_src/core/git_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,15 @@
from mlcroissant._src.core.optional import deps
from mlcroissant._src.core.path import Path

_GIT_LFS_CONTENT = (
lambda: """version https://git-lfs.github.com/spec/v1
_GIT_LFS_CONTENT = lambda: """version https://git-lfs.github.com/spec/v1
oid sha256:5e2785fcd9098567a49d6e62e328923d955b307b6dcd0492f6234e96e670772a
size 309207547
"""
)

_NON_GIT_LFS_CONTENT = (
lambda: """name,age
_NON_GIT_LFS_CONTENT = lambda: """name,age
a,1
b,2
c,3"""
)

_NON_ASCII_CONTENT = lambda: (255).to_bytes(1, byteorder="big")

Expand Down Expand Up @@ -54,6 +50,6 @@ def test_download_git_lfs_file():
with mock.patch.object(git, "Git", autospec=True) as git_mock:
download_git_lfs_file(file)
git_mock.assert_called_once_with("/tmp/full/")
git_mock.return_value.execute.assert_called_once_with(
["git", "lfs", "pull", "--include", "path.json"]
)
git_mock.return_value.execute.assert_called_once_with([
"git", "lfs", "pull", "--include", "path.json"
])
14 changes: 10 additions & 4 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,16 @@ def test_hermetic_loading(dataset_name, record_set_name, num_records):
@pytest.mark.parametrize(
["dataset_name", "record_set_name", "num_records"],
[
["flores-200/metadata.json",
"language_translations_train_data_with_metadata", 10],
["flores-200/metadata.json",
"language_translations_test_data_with_metadata", 10],
[
"flores-200/metadata.json",
"language_translations_train_data_with_metadata",
10,
],
[
"flores-200/metadata.json",
"language_translations_test_data_with_metadata",
10,
],
["gpt-3/metadata.json", "default", 10],
["huggingface-c4/metadata.json", "en", 1],
["huggingface-mnist/metadata.json", "default", 10],
Expand Down
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/nodes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Defines the public interface to the `mlcroissant.nodes` package."""

from mlcroissant._src.structure_graph.nodes.field import Field
from mlcroissant._src.structure_graph.nodes.file_object import FileObject
from mlcroissant._src.structure_graph.nodes.file_set import FileSet
Expand Down
6 changes: 3 additions & 3 deletions python/mlcroissant/mlcroissant/_src/operation_graph/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ def _add_operations_for_file_object(
>> Concatenate(operations=operations, node=successor)
]
if node.encoding_format and not should_extract(node.encoding_format):
fields = tuple(
[field for field in node.successors if isinstance(field, Field)]
)
fields = tuple([
field for field in node.successors if isinstance(field, Field)
])
operation >> Read(
operations=operations,
node=node,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ def __call__(self, *args: list[Path]) -> pd.DataFrame:
"""See class' docstring."""
assert len(args) > 0, "No dataframe to merge."
files = [file for files in args for file in files]
return pd.DataFrame(
{
FileProperty.filepath: [os.fspath(file.filepath) for file in files],
FileProperty.filename: [file.filename for file in files],
FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
}
)
return pd.DataFrame({
FileProperty.filepath: [os.fspath(file.filepath) for file in files],
FileProperty.filename: [file.filename for file in files],
FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
})
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ def _extract_lines(row: pd.Series) -> pd.Series:
"""Reads a file line-by-line and outputs a named pd.Series of the lines."""
path = epath.Path(row[FileProperty.filepath])
lines = path.open("rb").read().splitlines()
return pd.Series(
{**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))}
)
return pd.Series({
**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))
})


def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
return parse_json_content(json_content, self.fields)
else:
# Raw files are returned as a one-line pd.DataFrame.
return pd.DataFrame(
{
FileProperty.content: [json_content],
}
)
return pd.DataFrame({
FileProperty.content: [json_content],
})
elif encoding_format == EncodingFormat.JSON_LINES:
return pd.read_json(file, lines=True)
elif encoding_format == EncodingFormat.PARQUET:
Expand All @@ -119,11 +117,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
filepath, header=None, names=[FileProperty.lines]
)
else:
return pd.DataFrame(
{
FileProperty.content: [file.read()],
}
)
return pd.DataFrame({
FileProperty.content: [file.read()],
})
else:
raise ValueError(
f"Unsupported encoding format for file: {encoding_format}"
Expand Down
36 changes: 16 additions & 20 deletions python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,10 @@ def from_jsonld(cls, issues: Issues, json_) -> ParentField | None:

def to_json(self) -> Json:
"""Converts the `ParentField` to JSON."""
return remove_empty_values(
{
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
}
)
return remove_empty_values({
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
})


@dataclasses.dataclass(eq=False, repr=False)
Expand Down Expand Up @@ -128,20 +126,18 @@ def to_json(self) -> Json:
self.rdf.shorten_value(data_type) for data_type in self.data_types
]
parent_field = self.parent_field.to_json() if self.parent_field else None
return remove_empty_values(
{
"@type": "ml:Field",
"name": self.name,
"description": self.description,
"dataType": data_types[0] if len(data_types) == 1 else data_types,
"isEnumeration": self.is_enumeration,
"parentField": parent_field,
"repeated": self.repeated,
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
"subField": [sub_field.to_json() for sub_field in self.sub_fields],
}
)
return remove_empty_values({
"@type": "ml:Field",
"name": self.name,
"description": self.description,
"dataType": data_types[0] if len(data_types) == 1 else data_types,
"isEnumeration": self.is_enumeration,
"parentField": parent_field,
"repeated": self.repeated,
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
"subField": [sub_field.to_json() for sub_field in self.sub_fields],
})

@classmethod
def from_jsonld(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,18 @@ def to_json(self) -> Json:
contained_in: str | list[str] = self.contained_in[0]
else:
contained_in = self.contained_in
return remove_empty_values(
{
"@type": "sc:FileObject",
"name": self.name,
"description": self.description,
"contentSize": self.content_size,
"contentUrl": self.content_url,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"md5": self.md5,
"sha256": self.sha256,
"source": self.source.to_json() if self.source else None,
}
)
return remove_empty_values({
"@type": "sc:FileObject",
"name": self.name,
"description": self.description,
"contentSize": self.content_size,
"contentUrl": self.content_url,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"md5": self.md5,
"sha256": self.sha256,
"source": self.source.to_json() if self.source else None,
})

@classmethod
def from_jsonld(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ def test_checks_are_performed():
Node, "assert_has_exclusive_properties"
) as exclusive_mock:
create_test_node(FileObject)
mandatory_mock.assert_has_calls(
[mock.call("encoding_format", "name"), mock.call("content_url")]
)
mandatory_mock.assert_has_calls([
mock.call("encoding_format", "name"), mock.call("content_url")
])
exclusive_mock.assert_called_once_with(["md5", "sha256"])
validate_name_mock.assert_called_once()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,14 @@ def to_json(self) -> Json:
contained_in: str | list[str] = self.contained_in[0]
else:
contained_in = self.contained_in
return remove_empty_values(
{
"@type": "sc:FileSet",
"name": self.name,
"description": self.description,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"includes": self.includes,
}
)
return remove_empty_values({
"@type": "sc:FileSet",
"name": self.name,
"description": self.description,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"includes": self.includes,
})

@classmethod
def from_jsonld(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,17 @@ def __post_init__(self):

def to_json(self) -> Json:
"""Converts the `Metadata` to JSON."""
return remove_empty_values(
{
"@context": self.rdf.context,
"@type": "sc:Dataset",
"name": self.name,
"description": self.description,
"citation": self.citation,
"license": self.license,
"url": self.url,
"distribution": [f.to_json() for f in self.distribution],
"recordSet": [record_set.to_json() for record_set in self.record_sets],
}
)
return remove_empty_values({
"@context": self.rdf.context,
"@type": "sc:Dataset",
"name": self.name,
"description": self.description,
"citation": self.citation,
"license": self.license,
"url": self.url,
"distribution": [f.to_json() for f in self.distribution],
"recordSet": [record_set.to_json() for record_set in self.record_sets],
})

@property
def file_objects(self) -> list[FileObject]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,15 @@ def __post_init__(self):

def to_json(self) -> Json:
"""Converts the `RecordSet` to JSON."""
return remove_empty_values(
{
"@type": "ml:RecordSet",
"name": self.name,
"description": self.description,
"isEnumeration": self.is_enumeration,
"key": self.key,
"field": [field.to_json() for field in self.fields],
"data": self.data,
}
)
return remove_empty_values({
"@type": "ml:RecordSet",
"name": self.name,
"description": self.description,
"isEnumeration": self.is_enumeration,
"key": self.key,
"field": [field.to_json() for field in self.fields],
"data": self.data,
})

@classmethod
def from_jsonld(
Expand Down
40 changes: 17 additions & 23 deletions python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,11 @@ class Extract:

def to_json(self) -> Json:
"""Converts the `Extract` to JSON."""
return remove_empty_values(
{
"column": self.column,
"fileProperty": self.file_property.name if self.file_property else None,
"jsonPath": self.json_path,
}
)
return remove_empty_values({
"column": self.column,
"fileProperty": self.file_property.name if self.file_property else None,
"jsonPath": self.json_path,
})


@dataclasses.dataclass(frozen=True)
Expand All @@ -100,15 +98,13 @@ class Transform:

def to_json(self) -> Json:
"""Converts the `Transform` to JSON."""
return remove_empty_values(
{
"format": self.format,
"jsonPath": self.json_path,
"regex": self.regex,
"replace": self.replace,
"separator": self.separator,
}
)
return remove_empty_values({
"format": self.format,
"jsonPath": self.json_path,
"regex": self.regex,
"replace": self.replace,
"separator": self.separator,
})

@classmethod
def from_jsonld(cls, issues: Issues, jsonld: Json | list[Json]) -> list[Transform]:
Expand Down Expand Up @@ -196,13 +192,11 @@ def to_json(self) -> Json:
transforms = [transform.to_json() for transform in self.transforms]
if self.node_type is None:
raise ValueError("node_type should be `distribution` or `field`. Got: None")
return remove_empty_values(
{
self.node_type: self.uid,
"extract": self.extract.to_json(),
"transform": transforms[0] if len(transforms) == 1 else transforms,
}
)
return remove_empty_values({
self.node_type: self.uid,
"extract": self.extract.to_json(),
"transform": transforms[0] if len(transforms) == 1 else transforms,
})

@classmethod
def from_jsonld(cls, issues: Issues, jsonld: Any) -> Source:
Expand Down
Loading

0 comments on commit 26bee88

Please sign in to comment.