Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support merging output from multiple file keys when writing to stdout #115

Merged
merged 11 commits into from
Oct 22, 2024
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ ENV_NAME="cudf_test"
rapids-dependency-file-generator \
--file-key "test" \
--output "conda" \
--matrix "cuda=11.5;arch=$(arch)" > env.yaml
--matrix "cuda=12.5;arch=$(arch)" > env.yaml
mamba env create --file env.yaml
mamba activate "$ENV_NAME"

Expand All @@ -335,6 +335,16 @@ The `--file-key`, `--output`, and `--matrix` flags must be used together. `--mat

Where multiple values for the same key are passed to `--matrix`, e.g. `cuda_suffixed=true;cuda_suffixed=false`, only the last value will be used.

Where `--file-key` is supplied multiple times in the same invocation, the output printed to `stdout` will contain a union (without duplicates) of all of the corresponding dependencies. For example:

```shell
rapids-dependency-file-generator \
--file-key "test" \
--file-key "test_notebooks" \
--output "conda" \
--matrix "cuda=12.5;arch=$(arch)" > env.yaml
```

The `--prepend-channel` argument accepts additional channels to use, like `rapids-dependency-file-generator --prepend-channel my_channel --prepend-channel my_other_channel`.
If both `--output` and `--prepend-channel` are provided, the output format must be conda.
Prepending channels can be useful for adding local channels with packages to be tested in CI workflows.
Expand Down
8 changes: 6 additions & 2 deletions src/rapids_dependency_file_generator/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ def validate_args(argv):
codependent_args = parser.add_argument_group("optional, but codependent")
codependent_args.add_argument(
"--file-key",
help="The file key from `dependencies.yaml` to generate.",
action="append",
help=(
"The file key from `dependencies.yaml` to generate. "
"If supplied multiple times, dependency lists from all requested file keys will be merged."
),
)
codependent_args.add_argument(
"--output",
Expand Down Expand Up @@ -109,7 +113,7 @@ def main(argv=None) -> None:
to_stdout = all([args.file_key, args.output, args.matrix is not None])

if to_stdout:
file_keys = [args.file_key]
file_keys = args.file_key
output = {Output(args.output)}
else:
file_keys = list(parsed_config.files.keys())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import textwrap
import typing
from collections.abc import Generator
from dataclasses import dataclass

import tomlkit
import yaml
Expand Down Expand Up @@ -95,27 +96,32 @@ def grid(gridspec: dict[str, list[str]]) -> Generator[dict[str, str], None, None
def make_dependency_file(
*,
file_type: _config.Output,
name: os.PathLike,
conda_env_name: str,
file_name: str,
config_file: os.PathLike,
output_dir: os.PathLike,
conda_channels: list[str],
dependencies: typing.Sequence[typing.Union[str, dict[str, list[str]]]],
extras: typing.Union[_config.FileExtras, None],
):
) -> str:
"""Generate the contents of the dependency file.

Parameters
----------
file_type : Output
An Output value used to determine the file type.
name : PathLike
The name of the file to write.
conda_env_name : str
Name to put in the 'name: ' field when generating conda environment YAML files.
Only used when ``file_type`` is CONDA.
file_name : str
Name of a file in ``output_dir`` to read in.
Only used when ``file_type`` is PYPROJECT.
config_file : PathLike
The full path to the dependencies.yaml file.
output_dir : PathLike
The path to the directory where the dependency files will be written.
conda_channels : list[str]
The channels to include in the file. Only used when `file_type` is
The channels to include in the file. Only used when ``file_type`` is
CONDA.
dependencies : Sequence[str | dict[str, list[str]]]
The dependencies to include in the file.
Expand All @@ -137,7 +143,7 @@ def make_dependency_file(
if file_type == _config.Output.CONDA:
file_contents += yaml.dump(
{
"name": os.path.splitext(name)[0],
"name": conda_env_name,
"channels": conda_channels,
"dependencies": dependencies,
}
Expand Down Expand Up @@ -173,7 +179,7 @@ def make_dependency_file(
key = extras.key

# This file type needs to be modified in place instead of built from scratch.
with open(os.path.join(output_dir, name)) as f:
with open(os.path.join(output_dir, file_name)) as f:
file_contents_toml = tomlkit.load(f)

toml_deps = tomlkit.array()
Expand Down Expand Up @@ -320,6 +326,32 @@ def should_use_specific_entry(matrix_combo: dict[str, str], specific_entry_matri
)


@dataclass
class _DependencyCollection:
str_deps: set[str]
# e.g. {"pip": ["dgl", "pyg"]}, used in conda envs
dict_deps: dict[str, list[str]]

def update(self, deps: typing.Sequence[typing.Union[str, dict[str, list[str]]]]) -> None:
for dep in deps:
if isinstance(dep, dict):
for k, v in dep.items():
if k in self.dict_deps:
self.dict_deps[k].extend(v)
self.dict_deps[k] = sorted(set(self.dict_deps[k]))
else:
self.dict_deps[k] = v
else:
self.str_deps.add(dep)

@property
def deps_list(self) -> typing.Sequence[typing.Union[str, dict[str, list[str]]]]:
if self.dict_deps:
return [*sorted(self.str_deps), self.dict_deps]

return [*sorted(self.str_deps)]


def make_dependency_files(
*,
parsed_config: _config.Config,
Expand Down Expand Up @@ -360,6 +392,19 @@ def make_dependency_files(
If the file is malformed. There are numerous different error cases
which are described by the error messages.
"""
if to_stdout and len(file_keys) > 1 and output is not None and _config.Output.PYPROJECT in output:
raise ValueError(
f"Using --file-key multiple times together with '--output {_config.Output.PYPROJECT.value}' "
"when writing to stdout is not supported."
)

# the list of conda channels does not depend on individual file keys
conda_channels = prepend_channels + parsed_config.channels

# initialize a container for "all dependencies found across all files", to support
# passing multiple files keys and writing a merged result to stdout
all_dependencies = _DependencyCollection(str_deps=set(), dict_deps={})

for file_key in file_keys:
file_config = parsed_config.files[file_key]
file_types_to_generate = file_config.output if output is None else output
Expand Down Expand Up @@ -438,18 +483,50 @@ def make_dependency_files(
)
contents = make_dependency_file(
file_type=file_type,
name=full_file_name,
conda_env_name=os.path.splitext(full_file_name)[0],
file_name=full_file_name,
config_file=parsed_config.path,
output_dir=output_dir,
conda_channels=prepend_channels + parsed_config.channels,
conda_channels=conda_channels,
dependencies=deduped_deps,
extras=file_config.extras,
)

if to_stdout:
print(contents)
if len(file_keys) == 1:
print(contents)
else:
all_dependencies.update(deduped_deps)
else:
os.makedirs(output_dir, exist_ok=True)
file_path = os.path.join(output_dir, full_file_name)
with open(file_path, "w") as f:
f.write(contents)

# create one unified output from all the file_keys, and print it to stdout
if to_stdout and len(file_keys) > 1:
# convince mypy that 'output' is not None here
#
# 'output' is technically a set because of https://github.com/rapidsai/dependency-file-generator/pull/74,
# but since https://github.com/rapidsai/dependency-file-generator/pull/79 it's only ever one of the following:
#
# - an exactly-1-item set (stdout=True, or when used by rapids-build-backend)
# - 'None' (stdout=False)
#
err_msg = (
"Exactly 1 output type should be provided when asking rapids-dependency-file-generator to write to stdout. "
"If you see this, you've found a bug. Please report it at https://github.com/rapidsai/dependency-file-generator/issues."
jameslamb marked this conversation as resolved.
Show resolved Hide resolved
)
assert output is not None, err_msg

contents = make_dependency_file(
file_type=output.pop(),
conda_env_name="rapids-dfg-combined",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name: field in these generated conda environments is ignored in every workflow I'm aware of in RAPIDS... it's always overwritten by -n / --name passed through like this:

rapids-dependency-file-generator \
  --output conda \
  --file-key test_cpp \
  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"

mamba env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test

That's why I'm proposing a hard-coded mildly-informative name instead of taking on the complexity of other alternatives, like:

  • allowing conda_env_name=None to be passed to make_dependency_file()
  • trying to come up with some other representative environment name based on which --file-keys were passed

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’d weakly prefer allowing None here. I don’t think conda environment files require a name? Maybe we can just omit it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I just pushed df3848d changing this to allow None.

I don't think conda environment files require a name?

Correct.

cat > env.yaml <<EOF
channels:
- conda-forge
dependencies:
  - pandas
  - pip
  - pip:
    - scikit-learn
EOF

conda env create \
  --name delete-me \
  --file ./env.yaml

Works without issue

...
done
#
# To activate this environment, use
#
#     $ conda activate delete-me
#
# To deactivate an active environment, use
#
#     $ conda deactivate

file_name="ignored-because-multiple-pyproject-files-are-not-supported",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make_dependency_file() is already really 3 distinct codepaths inside one function. The codepath for generating pyproject output rightly expects to be given a path to an existing pyproject.toml file to read in.

I'm proposing just not allowing multiple --file-key together with --output pyproject and passing through this hard-coded nonsense string instead of taking on the complexity of other options like:

  • breaking make_dependency_file() up into e.g. make_conda_env_file(), make_requirements_file(), and make_pyproject_file()
  • allowing file_name=None to be passed to make_dependency_file() and generating TOML content from scratch if it is

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to read this 3 times but I think I understand all of the information in this comment? I don't have any feedback, I think this is maybe-fine-enough and if not we can fix it later.

Copy link
Member Author

@jameslamb jameslamb Oct 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me try a different way with links.

After processing dependencies.yaml, filtering stuff by output type and matrices, deduplicating, etc., DFG wants the literal text that it'll either echo to stdout or write to a file.

For that, it calls make_dependency_file():

Despite the name, that function actually just creates and returns that text... it never writes anything to the filesystem.

The body of the function, in pseudocode, is like:

def make_dependency_file(...):
    relative_path_to_config_file = os.path.relpath(config_file, output_dir)
    file_contents = textwrap.dedent(
        f"""\
        {HEADER}
        # To make changes, edit {relative_path_to_config_file} and run `{cli_name}`.
        """
    )
    if conda:
        # {code specific to conda env YAML files}
    elif requirements:
        # {code specific to requirements.txt}
    elif pyproject:
        # {code specific to pyproject.toml}

    return file_contents

So there's very little shared code in there. The list of arguments for the function contains a mix of things that are only used for some but not all of the output types.

Like this:

conda_channels : list[str]
The channels to include in the file. Only used when `file_type` is
CONDA.

That's complex but has been kind of "fine" as long as this was always being called with the data from a single --file-key.

For this PR, we now want to use that function to generate a single type of output from multiple --file-key entries. For that, there's no single representative output_dir, for example. Here, I've just kind of awkwardly provided values with the right types so mypy will be happy, with the knowledge that they don't really matter (since most of the arguments are pyproject-specific, and this PR isn't supporting pyproject.toml).

It'd be clearer if make_dependency_file() was decomposed into make_conda_env_file(), make_requirements_file() and make_pyproject_file(). But I really really did not want to take that on in the scope of this PR.

config_file=parsed_config.path,
output_dir=parsed_config.path,
conda_channels=conda_channels,
dependencies=all_dependencies.deps_list,
extras=None,
)
print(contents)
82 changes: 82 additions & 0 deletions tests/examples/overlapping-deps/dependencies.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
files:
build_deps:
output: [pyproject]
pyproject_dir: output/actual
extras:
table: build-system
includes:
- rapids_build_skbuild
- depends_on_numpy
even_more_build_deps:
output: [pyproject]
pyproject_dir: output/actual
extras:
table: tool.rapids-build-backend
key: requires
includes:
- depends_on_numpy
- depends_on_pandas
test_deps:
output: none
includes:
- depends_on_numpy
- depends_on_pandas
even_more_test_deps:
output: none
includes:
- depends_on_numpy
- test_python
test_with_sklearn:
output: none
includes:
- depends_on_scikit_learn
channels:
- rapidsai
- conda-forge
dependencies:
depends_on_numpy:
common:
- output_types: [requirements, pyproject]
packages:
- numpy>=2.0
# using 'pip' intentionally to test handling of that nested list
- output_types: [conda]
packages:
- pip
- pip:
- numpy >=2.0
depends_on_pandas:
common:
- output_types: [conda, requirements, pyproject]
packages:
- pandas<3.0
depends_on_scikit_learn:
common:
- output_types: [conda, requirements, pyproject]
packages:
- scikit-learn>=1.5
test_python:
common:
- output_types: [conda, requirements, pyproject]
packages:
- matplotlib
- output_types: [conda]
packages:
- pip
# intentional overlap (numpy) with depends_on_numpy's pip list, to
# test that pip dependencies don't have duplicates
- pip:
# intentionally not in alphabetical order
- numpy >=2.0
- folium
rapids_build_skbuild:
common:
- output_types: [conda, requirements, pyproject]
packages:
- rapids-build-backend>=0.3.1
- output_types: [requirements, pyproject]
packages:
- scikit-build-core[pyproject]>=0.9.0
- output_types: [conda]
packages:
- scikit-build-core>=0.9.0
20 changes: 20 additions & 0 deletions tests/examples/overlapping-deps/output/expected/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[build-system]
build-backend = "rapids_build_backend.build_meta"
requires = [
"numpy>=2.0",
"rapids-build-backend>=0.3.1",
"scikit-build-core[pyproject]>=0.9.0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

[project]
name = "libbeepboop"
version = "0.1.2"
dependencies = [
"scipy",
]

[tool.rapids-build-backend]
requires = [
"numpy>=2.0",
"pandas<3.0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
44 changes: 44 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,47 @@ def test_validate_args():
"all",
]
)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you are wondering "why isn't there a test case here checking that passing multiple --file-key together with --output pyproject is rejected?"... it's because I put the exception-raising code for that down further, in make_dependency_file(), so that the exception would also be raised if rapids-build-backend (which does not use the CLI) tried to pass inputs like that.

I did add a unit test in test_rapids_dependency_file_generator checking that that exception is raised as expected.

# Valid, with 2 files for --output requirements
validate_args(
[
"--output",
"requirements",
"--matrix",
"cuda=12.5",
"--file-key",
"all",
"--file-key",
"test_python",
]
)

# Valid, with 2 files for --output conda
validate_args(
[
"--output",
"conda",
"--matrix",
"cuda=12.5",
"--file-key",
"all",
"--file-key",
"test_python",
]
)

# Valid, with 3 files
validate_args(
[
"--output",
"requirements",
"--matrix",
"cuda=12.5",
"--file-key",
"all",
"--file-key",
"test_python",
"--file-key",
"build_python",
]
)
Loading
Loading