Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add automatic quality checks in GitHub Actions for the editor. #321

Merged
merged 6 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Continuous Integration (CI)
name: Continuous Integration - mlcroissant (CI)

on:
push:
Expand Down Expand Up @@ -92,7 +92,7 @@ jobs:
python-version: '3.11'
- uses: psf/black@stable
with:
options: --line-length 88 --preview --experimental-string-processing
options: --check --line-length 88 --preview
src: './python/mlcroissant'
- uses: isort/isort-action@master
with:
Expand Down
29 changes: 29 additions & 0 deletions .github/workflows/editor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Continuous Integration - Editor (CI)

on:
push:
branches:
- main
pull_request:
branches:
- main
merge_group:

jobs:
python-format:
name: Python format
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: '3.11'
- uses: psf/black@stable
with:
options: --check --line-length 88 --preview
src: './wizard'
- uses: isort/isort-action@master
with:
sort-paths: './wizard'
configuration: --profile google --line-length 88 --use-parentheses --project mlcroissant --project components --project core --project views --project state --project utils --multi-line 3 --thirdparty datasets
requirements-files: 'wizard/requirements.txt'
15 changes: 13 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,28 @@
"black-formatter.args": [
"--line-length",
"88",
"--preview",
"--experimental-string-processing"
"--preview"
],
"isort.args": [
"--profile",
"google",
"--line-length",
"88",
"--use-parentheses",
// For python/mlcroissant/
"--project",
"mlcroissant",
// For wizard/
"--project",
"components",
"--project",
"core",
"--project",
"views",
"--project",
"state",
"--project",
"utils",
"--multi-line",
"3",
"--thirdparty",
Expand Down
10 changes: 6 additions & 4 deletions python/mlcroissant/mlcroissant/_src/core/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def download_git_lfs_file(file: Path):
try:
repo.execute(["git", "lfs", "pull", "--include", fullpath])
except deps.git.exc.GitCommandError as ex:
raise RuntimeError("Problem when launching `git lfs`. "
"Possible problems: Have you installed git lfs "
f"locally? Is '{fullpath}' a valid `git lfs` "
"repository?") from ex
raise RuntimeError(
"Problem when launching `git lfs`. "
"Possible problems: Have you installed git lfs "
f"locally? Is '{fullpath}' a valid `git lfs` "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be more readable if it would display Possible problems:\n\t* Have you installed ...?\n\t* Is '{fullpath}' ...?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR only aims at reformating the code using the last version of black

"repository?"
) from ex
14 changes: 5 additions & 9 deletions python/mlcroissant/mlcroissant/_src/core/git_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,15 @@
from mlcroissant._src.core.optional import deps
from mlcroissant._src.core.path import Path

_GIT_LFS_CONTENT = (
lambda: """version https://git-lfs.github.com/spec/v1
_GIT_LFS_CONTENT = lambda: """version https://git-lfs.github.com/spec/v1
oid sha256:5e2785fcd9098567a49d6e62e328923d955b307b6dcd0492f6234e96e670772a
size 309207547
"""
)

_NON_GIT_LFS_CONTENT = (
lambda: """name,age
_NON_GIT_LFS_CONTENT = lambda: """name,age
a,1
b,2
c,3"""
)

_NON_ASCII_CONTENT = lambda: (255).to_bytes(1, byteorder="big")

Expand Down Expand Up @@ -54,6 +50,6 @@ def test_download_git_lfs_file():
with mock.patch.object(git, "Git", autospec=True) as git_mock:
download_git_lfs_file(file)
git_mock.assert_called_once_with("/tmp/full/")
git_mock.return_value.execute.assert_called_once_with(
["git", "lfs", "pull", "--include", "path.json"]
)
git_mock.return_value.execute.assert_called_once_with([
"git", "lfs", "pull", "--include", "path.json"
])
14 changes: 10 additions & 4 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,16 @@ def test_hermetic_loading(dataset_name, record_set_name, num_records):
@pytest.mark.parametrize(
["dataset_name", "record_set_name", "num_records"],
[
["flores-200/metadata.json",
"language_translations_train_data_with_metadata", 10],
["flores-200/metadata.json",
"language_translations_test_data_with_metadata", 10],
[
"flores-200/metadata.json",
"language_translations_train_data_with_metadata",
10,
],
[
"flores-200/metadata.json",
"language_translations_test_data_with_metadata",
10,
],
["gpt-3/metadata.json", "default", 10],
["huggingface-c4/metadata.json", "en", 1],
["huggingface-mnist/metadata.json", "default", 10],
Expand Down
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/nodes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Defines the public interface to the `mlcroissant.nodes` package."""

from mlcroissant._src.structure_graph.nodes.field import Field
from mlcroissant._src.structure_graph.nodes.file_object import FileObject
from mlcroissant._src.structure_graph.nodes.file_set import FileSet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ def _add_operations_for_file_object(
>> Concatenate(operations=operations, node=successor)
]
if node.encoding_format and not should_extract(node.encoding_format):
fields = tuple(
[field for field in node.successors if isinstance(field, Field)]
)
fields = tuple([
field for field in node.successors if isinstance(field, Field)
])
operation >> Read(
operations=operations,
node=node,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ def __call__(self, *args: list[Path]) -> pd.DataFrame:
"""See class' docstring."""
assert len(args) > 0, "No dataframe to merge."
files = [file for files in args for file in files]
return pd.DataFrame(
{
FileProperty.filepath: [os.fspath(file.filepath) for file in files],
FileProperty.filename: [file.filename for file in files],
FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
}
)
return pd.DataFrame({
FileProperty.filepath: [os.fspath(file.filepath) for file in files],
FileProperty.filename: [file.filename for file in files],
FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
})
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ def _extract_lines(row: pd.Series) -> pd.Series:
"""Reads a file line-by-line and outputs a named pd.Series of the lines."""
path = epath.Path(row[FileProperty.filepath])
lines = path.open("rb").read().splitlines()
return pd.Series(
{**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))}
)
return pd.Series({
**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))
})


def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
return parse_json_content(json_content, self.fields)
else:
# Raw files are returned as a one-line pd.DataFrame.
return pd.DataFrame(
{
FileProperty.content: [json_content],
}
)
return pd.DataFrame({
FileProperty.content: [json_content],
})
elif encoding_format == EncodingFormat.JSON_LINES:
return pd.read_json(file, lines=True)
elif encoding_format == EncodingFormat.PARQUET:
Expand All @@ -119,11 +117,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
filepath, header=None, names=[FileProperty.lines]
)
else:
return pd.DataFrame(
{
FileProperty.content: [file.read()],
}
)
return pd.DataFrame({
FileProperty.content: [file.read()],
})
else:
raise ValueError(
f"Unsupported encoding format for file: {encoding_format}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,10 @@ def from_jsonld(cls, issues: Issues, json_) -> ParentField | None:

def to_json(self) -> Json:
"""Converts the `ParentField` to JSON."""
return remove_empty_values(
{
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
}
)
return remove_empty_values({
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
})


@dataclasses.dataclass(eq=False, repr=False)
Expand Down Expand Up @@ -128,20 +126,18 @@ def to_json(self) -> Json:
self.rdf.shorten_value(data_type) for data_type in self.data_types
]
parent_field = self.parent_field.to_json() if self.parent_field else None
return remove_empty_values(
{
"@type": "ml:Field",
"name": self.name,
"description": self.description,
"dataType": data_types[0] if len(data_types) == 1 else data_types,
"isEnumeration": self.is_enumeration,
"parentField": parent_field,
"repeated": self.repeated,
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
"subField": [sub_field.to_json() for sub_field in self.sub_fields],
}
)
return remove_empty_values({
"@type": "ml:Field",
"name": self.name,
"description": self.description,
"dataType": data_types[0] if len(data_types) == 1 else data_types,
"isEnumeration": self.is_enumeration,
"parentField": parent_field,
"repeated": self.repeated,
"references": self.references.to_json() if self.references else None,
"source": self.source.to_json() if self.source else None,
"subField": [sub_field.to_json() for sub_field in self.sub_fields],
})

@classmethod
def from_jsonld(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,18 @@ def to_json(self) -> Json:
contained_in: str | list[str] = self.contained_in[0]
else:
contained_in = self.contained_in
return remove_empty_values(
{
"@type": "sc:FileObject",
"name": self.name,
"description": self.description,
"contentSize": self.content_size,
"contentUrl": self.content_url,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"md5": self.md5,
"sha256": self.sha256,
"source": self.source.to_json() if self.source else None,
}
)
return remove_empty_values({
"@type": "sc:FileObject",
"name": self.name,
"description": self.description,
"contentSize": self.content_size,
"contentUrl": self.content_url,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"md5": self.md5,
"sha256": self.sha256,
"source": self.source.to_json() if self.source else None,
})

@classmethod
def from_jsonld(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ def test_checks_are_performed():
Node, "assert_has_exclusive_properties"
) as exclusive_mock:
create_test_node(FileObject)
mandatory_mock.assert_has_calls(
[mock.call("encoding_format", "name"), mock.call("content_url")]
)
mandatory_mock.assert_has_calls([
mock.call("encoding_format", "name"), mock.call("content_url")
])
exclusive_mock.assert_called_once_with(["md5", "sha256"])
validate_name_mock.assert_called_once()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,14 @@ def to_json(self) -> Json:
contained_in: str | list[str] = self.contained_in[0]
else:
contained_in = self.contained_in
return remove_empty_values(
{
"@type": "sc:FileSet",
"name": self.name,
"description": self.description,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"includes": self.includes,
}
)
return remove_empty_values({
"@type": "sc:FileSet",
"name": self.name,
"description": self.description,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"includes": self.includes,
})

@classmethod
def from_jsonld(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,17 @@ def __post_init__(self):

def to_json(self) -> Json:
"""Converts the `Metadata` to JSON."""
return remove_empty_values(
{
"@context": self.rdf.context,
"@type": "sc:Dataset",
"name": self.name,
"description": self.description,
"citation": self.citation,
"license": self.license,
"url": self.url,
"distribution": [f.to_json() for f in self.distribution],
"recordSet": [record_set.to_json() for record_set in self.record_sets],
}
)
return remove_empty_values({
"@context": self.rdf.context,
"@type": "sc:Dataset",
"name": self.name,
"description": self.description,
"citation": self.citation,
"license": self.license,
"url": self.url,
"distribution": [f.to_json() for f in self.distribution],
"recordSet": [record_set.to_json() for record_set in self.record_sets],
})

@property
def file_objects(self) -> list[FileObject]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,15 @@ def __post_init__(self):

def to_json(self) -> Json:
"""Converts the `RecordSet` to JSON."""
return remove_empty_values(
{
"@type": "ml:RecordSet",
"name": self.name,
"description": self.description,
"isEnumeration": self.is_enumeration,
"key": self.key,
"field": [field.to_json() for field in self.fields],
"data": self.data,
}
)
return remove_empty_values({
"@type": "ml:RecordSet",
"name": self.name,
"description": self.description,
"isEnumeration": self.is_enumeration,
"key": self.key,
"field": [field.to_json() for field in self.fields],
"data": self.data,
})

@classmethod
def from_jsonld(
Expand Down
Loading
Loading